
Add a how-to section to the docs. #18998

This moves the "Performance Considerations for Elasticsearch Indexing" blog post
to the reference guide and adds similar recommendations for tuning disk usage
and search speed.
Adrien Grand 9 years ago
commit fbad3af352

+ 22 - 0
docs/reference/how-to.asciidoc

@@ -0,0 +1,22 @@
+[[how-to]]
+= How To
+
+[partintro]
+--
+Elasticsearch ships with defaults which are intended to give a good out-of-the-box
+experience. Full-text search, highlighting, aggregations, and indexing should all
+just work without the user having to change anything.
+
+Once you better understand how you want to use Elasticsearch, however,
+there are a number of optimizations you can make to improve performance
+for your use case.
+
+This section provides guidance about which changes should and shouldn't be
+made.
+--
+
+include::how-to/indexing-speed.asciidoc[]
+
+include::how-to/search-speed.asciidoc[]
+
+include::how-to/disk-usage.asciidoc[]

+ 159 - 0
docs/reference/how-to/disk-usage.asciidoc

@@ -0,0 +1,159 @@
+[[tune-for-disk-usage]]
+== Tune for disk usage
+
+[float]
+=== Disable the features you do not need
+
+By default, Elasticsearch indexes and adds doc values to most fields so that they
+can be searched and aggregated out of the box. For instance, if you have a numeric
+field called `foo` that you need to run histograms on but that you never need to
+filter on, you can safely disable indexing on this field in your
+<<mappings,mappings>>:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "integer",
+          "index": false
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+<<text,`text`>> fields store normalization factors in the index in order to be
+able to score documents. If you only need matching capabilities on a `text`
+field but do not care about the produced scores, you can configure Elasticsearch
+not to write norms to the index:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "text",
+          "norms": false
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+<<text,`text`>> fields also store frequencies and positions in the index by
+default. Frequencies are used to compute scores and positions are used to run
+phrase queries. If you do not need to run phrase queries, you can tell
+Elasticsearch not to index positions:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "text",
+          "index_options": "freqs"
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+Furthermore, if you do not care about scoring either, you can configure
+Elasticsearch to only index matching documents for every term. You will
+still be able to search on this field, but phrase queries will raise errors
+and scoring will assume that terms appear only once in every document.
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "text",
+          "norms": false,
+          "index_options": "docs"
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+[float]
+=== Don't use default dynamic string mappings
+
+The default <<dynamic-mapping,dynamic string mappings>> will index string fields
+both as <<text,`text`>> and <<keyword,`keyword`>>. This is wasteful if you only
+need one of them. Typically an `id` field will only need to be indexed as a
+`keyword` while a `body` field will only need to be indexed as a `text` field.
+
+You can avoid this dual mapping either by configuring explicit mappings on
+string fields or by setting up dynamic templates that map string fields as
+either `text` or `keyword`.
+
+For instance, here is a template that can be used in order to only map string
+fields as `keyword`:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "dynamic_templates": [
+        {
+          "strings": {
+            "match_mapping_type": "string",
+            "mapping": {
+              "type": "keyword"
+            }
+          }
+        }
+      ]
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+[float]
+=== Disable `_all`
+
+The <<mapping-all-field,`_all`>> field indexes the value of all fields of a
+document and can use significant space. If you never need to search against all
+fields at the same time, it can be disabled.
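+
+For instance, here is a minimal sketch of an index that disables `_all` in its
+mappings:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "_all": {
+        "enabled": false
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE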
+
+[float]
+=== Use `best_compression`
+
+The `_source` and stored fields can easily take a non-negligible amount of disk
+space. They can be compressed more aggressively by using the `best_compression`
+<<index-codec,codec>>.
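+
+For instance, the codec could be set at index creation time, as sketched below
+(`index.codec` is a static setting, so it has to be set when the index is
+created or while it is closed):
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "settings": {
+    "index": {
+      "codec": "best_compression"
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE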
+
+[float]
+=== Use the smallest numeric type that is sufficient
+
+When storing <<number,numeric data>>, using `float` over `double`, or `half_float`
+over `float`, can help save storage. This matters less for integer types, since
+Elasticsearch compresses them based on the number of bits that they actually
+need.
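+
+As a sketch, a hypothetical `rating` field whose values fit comfortably within
+`half_float` precision could be mapped as follows:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "rating": {
+          "type": "half_float"
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE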
+

+ 106 - 0
docs/reference/how-to/indexing-speed.asciidoc

@@ -0,0 +1,106 @@
+[[tune-for-indexing-speed]]
+== Tune for indexing speed
+
+[float]
+=== Use bulk requests
+
+Bulk requests will yield much better performance than single-document index
+requests. In order to know the optimal size of a bulk request, you should run
+a benchmark on a single node with a single shard. First try to index 100
+documents at once, then 200, then 400, etc., doubling the number of documents
+in a bulk request in every benchmark run. When the indexing speed starts to
+plateau, you know you have reached the optimal size of a bulk request for your
+data. In the case of a tie, it is better to err on the side of too few rather
+than too many documents. Beware that too-large bulk requests might put the
+cluster under memory pressure when many of them are sent concurrently, so it
+is advisable to avoid going beyond a few tens of megabytes per request even
+if larger requests seem to perform better.
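+
+For reference, here is a minimal sketch of a bulk request that indexes two
+documents into a hypothetical `index`/`type` (the field names are made up):
+
+[source,js]
+--------------------------------------------------
+POST _bulk
+{ "index": { "_index": "index", "_type": "type", "_id": "1" } }
+{ "designation": "spoon", "price": 13 }
+{ "index": { "_index": "index", "_type": "type", "_id": "2" } }
+{ "designation": "fork", "price": 12 }
+--------------------------------------------------
+// CONSOLE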
+
+[float]
+=== Use multiple workers/threads to send data to Elasticsearch
+
+A single thread sending bulk requests is unlikely to be able to max out the
+indexing capacity of an Elasticsearch cluster. In order to use all resources
+of the cluster, you should send data from multiple threads or processes. In
+addition to making better use of the resources of the cluster, this should
+help reduce the cost of each fsync.
+
+Make sure to watch for `TOO_MANY_REQUESTS (429)` response codes
+(`EsRejectedExecutionException` with the Java client), which is the way that
+Elasticsearch tells you that it cannot keep up with the current indexing rate.
+When this happens, you should pause indexing a bit before trying again, ideally
+with randomized exponential backoff.
+
+As with sizing bulk requests, only testing can tell what the optimal number of
+workers is. This can be tested by progressively increasing the number of
+workers until either I/O or CPU is saturated on the cluster.
+
+[float]
+=== Increase the refresh interval
+
+The default <<dynamic-index-settings,`index.refresh_interval`>> is `1s`, which
+forces Elasticsearch to create a new segment every second.
+Increasing this value (to, say, `30s`) will allow larger segments to flush and
+decreases future merge pressure.
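+
+For instance, assuming an index named `index`, the refresh interval can be
+updated dynamically like this:
+
+[source,js]
+--------------------------------------------------
+PUT index/_settings
+{
+  "index": {
+    "refresh_interval": "30s"
+  }
+}
+--------------------------------------------------
+// CONSOLE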
+
+[float]
+=== Disable refresh and replicas for initial loads
+
+If you need to load a large amount of data at once, you should disable refresh
+by setting `index.refresh_interval` to `-1` and set `index.number_of_replicas`
+to `0`. This will temporarily put your index at risk since the loss of any shard
+will cause data loss, but at the same time indexing will be faster since
+documents will be indexed only once. Once the initial loading is finished, you
+can set `index.refresh_interval` and `index.number_of_replicas` back to their
+original values.
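+
+As a sketch, assuming an index named `index`, the settings could be changed
+like this before the load:
+
+[source,js]
+--------------------------------------------------
+PUT index/_settings
+{
+  "index": {
+    "refresh_interval": "-1",
+    "number_of_replicas": 0
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+Once loading is done, the same `_settings` endpoint can be used to set both
+settings back to their original values (for instance `1s` and `1`).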
+
+[float]
+=== Disable swapping
+
+You should make sure that the operating system is not swapping out the Java
+process by <<setup-configuration-memory,disabling swapping>>.
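+
+One way to do this, assuming memory locking is allowed by the operating system,
+is to enable `bootstrap.memory_lock` in `elasticsearch.yml`:
+
+[source,yaml]
+--------------------------------------------------
+bootstrap.memory_lock: true
+--------------------------------------------------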
+
+[float]
+=== Give memory to the filesystem cache
+
+The filesystem cache will be used in order to buffer I/O operations. You should
+make sure to give at least half the memory of the machine running Elasticsearch
+to the filesystem cache.
+
+[float]
+=== Use faster hardware
+
+If indexing is I/O-bound, you should investigate giving more memory to the
+filesystem cache (see above) or buying faster drives. In particular, SSD drives
+are known to perform better than spinning disks. Always use local storage;
+remote filesystems such as `NFS` or `SMB` should be avoided. Also beware of
+virtualized storage such as Amazon's `Elastic Block Storage`. Virtualized
+storage works very well with Elasticsearch, and it is appealing since it is so
+fast and simple to set up, but it is also unfortunately inherently slower on an
+ongoing basis when compared to dedicated local storage. If you put an index on
+`EBS`, be sure to use provisioned IOPS, otherwise operations could be quickly
+throttled.
+
+Stripe your index across multiple SSDs by configuring a RAID 0 array. Remember
+that it will increase the risk of failure since the failure of any one SSD
+destroys the index. However, this is typically the right tradeoff to make:
+optimize single shards for maximum performance, and then add replicas across
+different nodes so there's redundancy for any node failures. You can also use
+<<modules-snapshots,snapshot and restore>> to backup the index for further
+insurance.
+
+[float]
+=== Indexing buffer size
+
+If your node is doing only heavy indexing, be sure
+<<indexing-buffer,`indices.memory.index_buffer_size`>> is large enough to give
+at most 512 MB indexing buffer per shard doing heavy indexing (beyond that
+indexing performance does not typically improve). Elasticsearch takes that
+setting (a percentage of the java heap or an absolute byte-size), and
+uses it as a shared buffer across all active shards. Very active shards will
+naturally use this buffer more than shards that are performing lightweight
+indexing.
+
+The default is `10%` which is often plenty: for example, if you give the JVM
+10GB of memory, it will give 1GB to the index buffer, which is enough to host
+two shards that are heavily indexing.
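+
+As a sketch, a node dedicated to heavy indexing could raise this buffer in
+`elasticsearch.yml` (the `20%` value below is only an example):
+
+[source,yaml]
+--------------------------------------------------
+indices.memory.index_buffer_size: 20%
+--------------------------------------------------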

+ 194 - 0
docs/reference/how-to/search-speed.asciidoc

@@ -0,0 +1,194 @@
+[[tune-for-search-speed]]
+== Tune for search speed
+
+[float]
+=== Give memory to the filesystem cache
+
+Elasticsearch heavily relies on the filesystem cache in order to make search
+fast. In general, you should make sure that at least half the available memory
+goes to the filesystem cache so that Elasticsearch can keep hot regions of the
+index in physical memory.
+
+[float]
+=== Use faster hardware
+
+If your search is I/O-bound, you should investigate giving more memory to the
+filesystem cache (see above) or buying faster drives. In particular, SSD drives
+are known to perform better than spinning disks. Always use local storage;
+remote filesystems such as `NFS` or `SMB` should be avoided. Also beware of
+virtualized storage such as Amazon's `Elastic Block Storage`. Virtualized
+storage works very well with Elasticsearch, and it is appealing since it is so
+fast and simple to set up, but it is also unfortunately inherently slower on an
+ongoing basis when compared to dedicated local storage. If you put an index on
+`EBS`, be sure to use provisioned IOPS, otherwise operations could be quickly
+throttled.
+
+If your search is CPU-bound, you should investigate buying faster CPUs.
+
+[float]
+=== Document modeling
+
+Documents should be modeled so that search-time operations are as cheap as possible.
+
+In particular, joins should be avoided. <<nested,`nested`>> fields can make
+queries several times slower and <<mapping-parent-field,parent-child>> relations
+can make queries hundreds of times slower. So if the same questions can be
+answered without joins by denormalizing documents, significant speedups can be
+expected.
+
+[float]
+=== Pre-index data
+
+You should leverage patterns in your queries to optimize the way data is indexed.
+For instance, if all your documents have a `price` field and most queries run
+<<search-aggregations-bucket-range-aggregation,`range`>> aggregations on a fixed
+list of ranges, you could make this aggregation faster by pre-indexing the ranges
+into the index and using a <<search-aggregations-bucket-terms-aggregation,`terms`>>
+aggregation.
+
+For instance, if documents look like:
+
+[source,js]
+--------------------------------------------------
+PUT index/type/1
+{
+  "designation": "spoon",
+  "price": 13
+}
+--------------------------------------------------
+// CONSOLE
+
+and search requests look like:
+
+[source,js]
+--------------------------------------------------
+GET index/_search
+{
+  "aggs": {
+    "price_ranges": {
+      "range": {
+        "field": "price",
+        "ranges": [
+          { "to": 10 },
+          { "from": 10, "to": 100 },
+          { "from": 100 }
+        ]
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+Then documents could be enriched with a `price_range` field at index time, which
+should be mapped as a <<keyword,`keyword`>>:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "price_range": {
+          "type": "keyword"
+        }
+      }
+    }
+  }
+}
+
+PUT index/type/1
+{
+  "designation": "spoon",
+  "price": 13,
+  "price_range": "10-100"
+}
+--------------------------------------------------
+// CONSOLE
+
+Search requests could then aggregate on this new field rather than running a
+`range` aggregation on the `price` field:
+
+[source,js]
+--------------------------------------------------
+GET index/_search
+{
+  "aggs": {
+    "price_ranges": {
+      "terms": {
+        "field": "price_range"
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+[float]
+=== Mappings
+
+The fact that some data is numeric does not mean it should always be mapped as a
+<<number,numeric field>>. Typically, fields storing identifiers such as an `ISBN`
+or any number identifying a record from another database might benefit from
+being mapped as <<keyword,`keyword`>> rather than `integer` or `long`.
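+
+For instance, a hypothetical `isbn` field could be mapped like this:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "isbn": {
+          "type": "keyword"
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE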
+
+[float]
+=== Avoid scripts
+
+In general, scripts should be avoided. If they are absolutely needed, you
+should prefer the `painless` and `expressions` engines.
+
+[float]
+=== Force-merge read-only indices
+
+Indices that are read-only would benefit from being
+<<indices-forcemerge,merged down to a single segment>>. This is typically the
+case with time-based indices: only the index for the current time frame is
+getting new documents while older indices are read-only.
+
+IMPORTANT: Don't force-merge indices that are still being written to -- leave
+merging to the background merge process.
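+
+As a sketch, a read-only index named `index` could be merged down to a single
+segment with the force-merge API:
+
+[source,js]
+--------------------------------------------------
+POST index/_forcemerge?max_num_segments=1
+--------------------------------------------------
+// CONSOLE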
+
+[float]
+=== Warm up global ordinals
+
+Global ordinals are a data structure that is used in order to run
+<<search-aggregations-bucket-terms-aggregation,`terms`>> aggregations on
+<<keyword,`keyword`>> fields. They are loaded lazily in memory because
+Elasticsearch does not know which fields will be used in `terms` aggregations
+and which fields won't. You can tell Elasticsearch to load global ordinals
+eagerly at refresh time by configuring mappings as described below:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "keyword",
+          "eager_global_ordinals": true
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+[float]
+=== Warm up the filesystem cache
+
+If the machine running Elasticsearch is restarted, the filesystem cache will be
+empty, so it will take some time before the operating system loads hot regions
+of the index into memory so that search operations are fast. You can explicitly
+tell the operating system which files should be loaded into memory eagerly
+depending on the file extension using the <<file-system,`index.store.preload`>>
+setting.
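+
+For instance, a hypothetical index could be created with preloading enabled for
+doc values and norms files (the `nvd` and `dvd` extensions below are only
+examples; pick the extensions that match your access patterns):
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "settings": {
+    "index.store.preload": ["nvd", "dvd"]
+  }
+}
+--------------------------------------------------
+// CONSOLE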
+
+WARNING: Loading data into the filesystem cache eagerly on too many indices or
+too many files will make search _slower_ if the filesystem cache is not large
+enough to hold all the data. Use with caution.

+ 2 - 0
docs/reference/index.asciidoc

@@ -43,6 +43,8 @@ include::index-modules.asciidoc[]
 
 include::ingest.asciidoc[]
 
+include::how-to.asciidoc[]
+
 include::testing.asciidoc[]
 
 include::glossary.asciidoc[]