Elasticsearch Rosetta Stone
I’ve recently done some work with Elasticsearch. It’s a great platform, but I still found it pretty hard to get started. It wasn’t obvious what was what. Perhaps I didn’t read the docs closely enough. Anyhow, I wrote up some examples demonstrating how to get started with Elasticsearch.
Quick Intro
You’ll need a running Elasticsearch instance to try these examples. Just download Elasticsearch, unzip it, and run bin/elasticsearch. No need to use anything but the defaults.
Some vocabulary: Elasticsearch stores documents with types in indexes (noun).
Approximately, document : type : index :: row : table : database.
To add a new document, you index (verb) it.
Since Elasticsearch exposes a REST API, most client libraries mirror it. Most methods take arguments for the index and the type, plus another argument representing the request body, usually some sort of JSON-esque data structure.
Examples
OK, enough of that. I wrote examples in Ruby, Python, JavaScript, and PHP, because people seem to be into that sort of thing. You can find all the samples on GitHub.
Ruby
require 'elasticsearch'
describe Elasticsearch do
  it "can connect to an elasticsearch server" do
    # Clients written in languages other than Java talk to Elasticsearch
    # through its REST API, which listens on port 9200 by default.
    Elasticsearch::Client.new(host: 'localhost', port: 9200)
  end

  it "can add arbitrary documents to the index" do
    es = Elasticsearch::Client.new

    # Any hash-shaped document can be stored.
    document = { title: "My Document", body: "Hello world." }

    # Every document is filed under an INDEX (think "database") and a
    # TYPE (think "table"). An ID can be passed here too if you would
    # rather generate your own.
    es.index(index: "blog", type: "post", body: document)
  end

  describe "search" do
    let(:post_1) { { "title" => "Hello World", "body" => "This is a post" } }
    let(:post_2) { { "title" => "Camp Grenada", "body" => "Hello mother, hello father." } }

    before(:each) do
      @client = Elasticsearch::Client.new

      # Remove every document from the blog index so each example starts
      # from the same two posts.
      @client.delete_by_query(index: "blog", body: { query: { "match_all" => {} } })

      [post_1, post_2].each do |document|
        @client.index(index: "blog", type: "post", body: document)
      end

      # Freshly indexed documents are not searchable until the index has
      # been refreshed.
      @client.perform_request("POST", "/blog/_refresh")
    end

    it "returns both results when searching for 'hello'" do
      # A term query on _all looks for the term in any field.
      any_field_query = { term: { _all: "hello" } }
      response = @client.search(
        index: "blog",
        type: "post",
        body: { query: any_field_query }
      )

      # The response carries some information about the search itself
      # (the keys checked here are not the complete list).
      shard_summary = { "total" => 5, "successful" => 5, "failed" => 0 }
      response.should include({ "timed_out" => false, "_shards" => shard_summary })

      # The matches are found under "hits", next to some metadata.
      response.should include "hits"
      hit_info = response["hits"]
      hit_info.should include("total", "max_score", "hits")

      # "Camp Grenada" is ranked first because it contains more hellos.
      hit_info["total"].should eq(2)
      top_hit, runner_up = hit_info["hits"]
      top_hit["_source"].should eq(post_2)
      runner_up["_source"].should eq(post_1)
    end

    it "returns one result when searching for 'Hello' in the title" do
      # Same term query, but restricted to the title field.
      title_query = { term: { title: "hello" } }
      response = @client.search(
        index: "blog",
        type: "post",
        body: { query: title_query }
      )

      response["hits"]["total"].should eq(1)
    end
  end
end
Python
from unittest import TestCase, main
from elasticsearch import Elasticsearch
class TestElasticsearch(TestCase):
    # Executable walk-through of the basic client operations: connecting,
    # indexing a document, and searching. Requires a reachable server.

    def setUp(self):
        # Every test starts with a client built from the library defaults.
        self.client = Elasticsearch()

    def test_connect_to_server(self):
        """Non-java clients connect to Elasticsearch via the REST api,
        which runs on port 9200 by default."""
        Elasticsearch(host="localhost", port=9200)

    def test_index_document(self):
        """Elasticsearch can store arbitrary documents"""
        document = {"title": "My Document", "body": "Hello world."}

        # Every document is filed under an INDEX (think "database") and a
        # TYPE (think "table"). An ID can be passed here too if you would
        # rather generate your own.
        self.client.index(index="blog", doc_type="post", body=document)

    def test_search_all(self):
        """Elasticsearch can search for your documents in many ways.
        """
        hello_world = {"title": "Hello World", "body": "This is a post"}
        camp_grenada = {
            "title": "Camp Grenada",
            "body": "Hello mother, hello father.",
        }

        # Remove every document from the blog index so the searches below
        # only ever see the two posts indexed here.
        self.client.delete_by_query(
            index="blog", body={'query': {"match_all": {}}})

        for document in [hello_world, camp_grenada]:
            self.client.index(index="blog", doc_type="post", body=document)

        # Freshly indexed documents are not searchable until the index has
        # been refreshed.
        self.client.indices.refresh(index="blog")

        # A term query on _all looks for the term in any field.
        any_field_query = {'term': {'_all': 'hello'}}
        response = self.client.search(
            index="blog", doc_type="post", body={'query': any_field_query})

        # The response carries some information about the search itself
        # (the keys checked here are not the complete list).
        self.assertFalse(response['timed_out'])
        self.assertEqual(
            {'total': 5, 'successful': 5, 'failed': 0},
            response['_shards'])

        # The matches are found under "hits", next to some metadata; the
        # nested "hits" list is where the actual documents live.
        assert "hits" in response
        for key in ("total", "max_score", "hits"):
            assert key in response['hits']

        # "Camp Grenada" is ranked first because it contains more hellos.
        self.assertEqual(2, response['hits']['total'])
        matches = response['hits']['hits']
        self.assertEqual(camp_grenada, matches[0]['_source'])
        self.assertEqual(hello_world, matches[1]['_source'])

        # Same term query, but restricted to the title field.
        title_query = {'term': {'title': 'hello'}}
        response = self.client.search(
            index="blog", doc_type="post", body={'query': title_query})

        self.assertEqual(1, response['hits']['total'])
        self.assertEqual(hello_world, response['hits']['hits'][0]['_source'])


if __name__ == "__main__":
    main()
Javascript
var assert = require('assert');
var elasticsearch = require('elasticsearch');

// Non-java clients connect to Elasticsearch via the REST api,
// which runs on port 9200 by default.
var client = new elasticsearch.Client({
  host: 'localhost:9200'
});

// Elasticsearch can store arbitrary documents
var post = {
  title: "My Document",
  body: "Hello world."
};

// The two posts the search steps below expect to find.
var post1 = {
  "title": "Hello World",
  "body": "This is a post"
};
var post2 = {
  "title": "Camp Grenada",
  "body": "Hello mother, hello father."
};

// Documents in elasticsearch have an INDEX and a
// TYPE. You can also specify an ID here if you
// want to generate your own.
client.index({
  index: "blog", // Think "database"
  type: "post",  // Think "table"
  body: post
}).then(function () {
  // Remove every document from the blog index so the searches below
  // only see post1 and post2.
  return client.deleteByQuery({
    index: "blog",
    body: {query: {"match_all": {}}}
  });
}).then(function () {
  return client.index({index: "blog", type: "post", body: post1});
}).then(function () {
  return client.index({index: "blog", type: "post", body: post2});
}).then(function () {
  // If the index is not refreshed, we won't find our documents right away.
  return client.indices.refresh({index: "blog"});
}).then(function () {
  var anyFieldQuery = {
    term: {          // Search for a given term
      _all: "hello"  // in any field (_all is a special wildcard)
    }
  };
  return client.search({
    index: "blog",
    type: "post",
    body: {query: anyFieldQuery}
  });
}).then(function (searchResult) {
  // The assert module throws on failure, so a failed check rejects the
  // chain and reaches the failure handler at the end. console.assert only
  // prints a message in current Node.js releases, which let "Ok." be
  // reported even when a check had failed.
  assert.ok(!searchResult.timed_out);
  assert.strictEqual(searchResult._shards.total, 5);
  assert.strictEqual(searchResult._shards.successful, 5);
  assert.strictEqual(searchResult._shards.failed, 0);
  assert.strictEqual(searchResult.hits.total, 2);

  // "Camp Grenada" comes first because it has more hellos
  var topHit = searchResult.hits.hits[0]._source;
  assert.strictEqual(topHit.title, post2.title);
  assert.strictEqual(topHit.body, post2.body);

  var secondHit = searchResult.hits.hits[1]._source;
  assert.strictEqual(secondHit.title, post1.title);
  assert.strictEqual(secondHit.body, post1.body);
}).then(function () {
  var titleQuery = {
    term: {           // Search for a given term
      title: "hello"  // in the title
    }
  };
  return client.search({
    index: "blog",
    type: "post",
    body: {query: titleQuery}
  });
}).then(function (searchResult) {
  assert.strictEqual(searchResult.hits.total, 1);
  console.log("Ok.");
}).catch(function (err) {
  // A trailing catch also sees failures raised by the final success
  // handler; report the cause and signal failure through the exit code.
  console.log("FAILED");
  console.error(err);
  process.exitCode = 1;
});
PHP
<?php
require 'vendor/autoload.php';

# Non-java clients connect to Elasticsearch via the REST api,
# which runs on port 9200 by default.
$client = new Elasticsearch\Client([
    'hosts' => ['localhost:9200']
]);

# Elasticsearch can store arbitrary documents
$post = [
    "title" => "My Document",
    "body" => "Hello world."
];

# Documents in elasticsearch have an INDEX and a
# TYPE. You can also specify an ID here if you
# want to generate your own.
$client->index([
    'index' => 'blog', # Think "database"
    'type' => "post",  # Think "table"
    'body' => $post
]);

# Remove every document from the blog index so the search below only
# sees the two posts indexed next. An empty PHP array is JSON-encoded as
# the list [], so the empty match_all body is passed as an object to get {}.
$client->deleteByQuery([
    "index" => "blog",
    "body" => [
        'query' => [
            "match_all" => new \stdClass()
        ]
    ]
]);

$post_1 = [
    "title" => "Hello World",
    "body" => "This is a post"
];
$post_2 = [
    "title" => "Camp Grenada",
    "body" => "Hello mother, hello father."
];

# Index those documents
foreach ([$post_1, $post_2] as $document) {
    $client->index([
        'index' => 'blog',
        'type' => "post",
        'body' => $document
    ]);
}

# If the index is not refreshed, we won't find our documents
# right away.
$client->indices()->refresh(["index" => "blog"]);

$query = [
    'term' => [            # Search for a given term
        '_all' => 'hello'  # in any field (_all is a special wildcard)
    ]
];
$search_result = $client->search([
    "index" => "blog",
    "type" => "post",
    "body" => ['query' => $query]
]);

# The search result contains some info about itself
# (this is not exhaustive)
assert(!$search_result['timed_out']);
assert($search_result['_shards']['total'] === 5);
assert($search_result['_shards']['successful'] === 5);
assert($search_result['_shards']['failed'] === 0);

# The search hits are in "hits"; "Camp Grenada" comes first because it
# has more hellos.
assert($search_result["hits"]);
assert($search_result["hits"]["total"] === 2);
assert($search_result["hits"]["hits"][0]['_source'] == $post_2);
assert($search_result["hits"]["hits"][1]['_source'] == $post_1);
echo "Ok.\n";