March 20, 2014

Elasticsearch Rosetta Stone

I’ve recently done some work with Elasticsearch. It’s a great platform, but I still found it pretty hard to get started. It wasn’t obvious what was what. Perhaps I didn’t read the docs closely enough. Anyhow, I wrote up some examples demonstrating how to get started with Elasticsearch.

Quick Intro

You’ll need a running Elasticsearch server to run these examples. Just download Elasticsearch, unzip it, and run bin/elasticsearch. No need to use anything but the defaults.

Some vocabulary: Elasticsearch stores documents with types in indexes (noun). Approximately, document : type : index :: row : table : database. To add a new document, you index (verb) it.

Since Elasticsearch exposes a REST API, most client libraries mirror it. Most methods take an index, a type, and a third argument representing the request body, usually some sort of JSON-esque data structure.

Examples

Ok, enough of that. I wrote examples in Ruby, Python, Javascript, and PHP, because people seem to be into that sort of thing. You can find all the samples on GitHub here.

Ruby

require 'elasticsearch'

describe Elasticsearch do
  # Client shared by the examples below. With no arguments the client
  # uses the library's default connection settings.
  let(:client) { Elasticsearch::Client.new }

  it "can connect to an elasticsearch server" do
    # Non-java clients connect to Elasticsearch via the REST api,
    # which runs on port 9200 by default.
    explicit_client = Elasticsearch::Client.new({
      host: 'localhost',
      port: 9200
    })

    # Constructing a client does not by itself prove that the server is
    # reachable, so send a ping and check that it succeeds.
    expect(explicit_client.ping).to eq(true)
  end

  it "can add arbitrary documents to the index" do
    # Elasticsearch can store arbitrary documents
    post = {
      title: "My Document",
      body: "Hello world."
    }

    # Documents in elasticsearch have an INDEX and a
    # TYPE. You can also specify an ID here if you
    # want to generate your own.
    response = client.index({
      index: "blog",  # Think "database"
      type: "post",   # Think "table"
      body: post
    })

    # The response reports where the document was stored and under which ID.
    expect(response).to include("_index" => "blog", "_type" => "post")
    expect(response).to include("_id")
  end

  describe "search" do
    let(:post_1) do
      {
        "title" => "Hello World",
        "body" => "This is a post"
      }
    end

    let(:post_2) do
      {
        "title" => "Camp Grenada",
        "body" => "Hello mother, hello father."
      }
    end

    before(:each) do
      # Remove every document from the blog index (the index itself is kept)
      # so that each example starts from the same two posts.
      client.delete_by_query index: "blog", body: {query: {"match_all" => {}}}

      [post_1, post_2].each do |post|
        client.index index: "blog", type: "post", body: post
      end

      # If the index is not refreshed, we won't find our documents right away.
      client.indices.refresh index: "blog"
    end

    it "returns both results when searching for 'hello'" do
      query = {
        term: {          # Search for a given term
          _all: "hello"  # in any field (_all is a special wildcard)
        }
      }

      search_result = client.search({
        index: "blog",
        type: "post",
        body: {query: query}
      })

      # The search result contains some info about itself (this is not
      # exhaustive). The shard counts assume the server's default settings.
      expect(search_result).to include({
        "timed_out" => false,
        "_shards" => {
          "total" => 5,
          "successful" => 5,
          "failed" => 0
        }
      })

      # The search hits are in "hits"
      expect(search_result).to include("hits")

      # "hits" contains some metadata
      expect(search_result["hits"]).to include("total", "max_score", "hits")

      # "Camp Grenada" comes first because it has more hellos
      hits = search_result["hits"]["hits"]
      expect(search_result["hits"]["total"]).to eq(2)
      expect(hits[0]["_source"]).to eq(post_2)
      expect(hits[1]["_source"]).to eq(post_1)
    end

    it "returns one result when searching for 'Hello' in the title" do
      query = {
        term: {           # Search for a given term
          title: "hello"  # in the title
        }
      }

      search_result = client.search({
        index: "blog",
        type: "post",
        body: {query: query}
      })

      # Only "Hello World" has the term in its title.
      expect(search_result["hits"]["total"]).to eq(1)
      expect(search_result["hits"]["hits"][0]["_source"]).to eq(post_1)
    end
  end
end

Python

from unittest import TestCase, main
from elasticsearch import Elasticsearch


class TestElasticsearch(TestCase):
    def setUp(self):
        self.client = Elasticsearch()

    def test_connect_to_server(self):
        """Non-java clients connect to Elasticsearch via the REST api,
        which runs on port 9200 by default."""

        Elasticsearch(
            host="localhost",
            port=9200
        )

    def test_index_document(self):
        """Elasticsearch can store arbitrary documents"""
        post = {
            "title": "My Document",
            "body": "Hello world."
        }

        # Documents in elasticsearch have an INDEX and a
        # TYPE. You can also specify an ID here if you
        # want to generate your own.
        self.client.index(
            index="blog",  # Think "database"
            doc_type="post",
            body=post
        )

    def test_search_all(self):
        """Elasticsearch can search for your documents in many ways.
        """
        post_1 = {
            "title": "Hello World",
            "body": "This is a post"
        }

        post_2 = {
            "title": "Camp Grenada",
            "body": "Hello mother, hello father."
        }

        self.client.delete_by_query(
            index="blog",
            body={'query': {"match_all": {}}}
        )

        for post in [post_1, post_2]:
            self.client.index(
                index="blog",
                doc_type="post",
                body=post)

        # If the index is not refreshed, we won't find our documents
        # right away.
        self.client.indices.refresh(index="blog")

        query = {
            'term': {            # Search for a given term
                '_all': 'hello'  # in any field (_all is a special wildcard)
            }
        }

        search_result = self.client.search(
            index="blog",
            doc_type="post",
            body={'query': query}
        )

        # The search result contains some info about itself
        # (this is not exhaustive)
        self.assertFalse(search_result['timed_out'])
        self.assertEqual(
            {'total': 5, 'successful': 5, 'failed': 0},
            search_result['_shards'])

        assert "hits" in search_result
        assert "total" in search_result['hits']
        assert "max_score" in search_result['hits']
        assert "hits" in search_result['hits']  # Where the actual hits live

        # "Camp Grenada" comes first because it has more hellos
        self.assertEqual(2, search_result['hits']['total'])
        self.assertEqual(post_2, search_result['hits']['hits'][0]['_source'])
        self.assertEqual(post_1, search_result['hits']['hits'][1]['_source'])

        # We can search by field, too
        query = {
            'term': {            # Search for a given term
                'title': 'hello'  # in any field (_all is a special wildcard)
            }
        }

        search_result = self.client.search(
            index="blog",
            doc_type="post",
            body={'query': query}
        )

        self.assertEqual(1, search_result['hits']['total'])
        self.assertEqual(post_1, search_result['hits']['hits'][0]['_source'])

# Hand control to unittest's `main` when this file is executed directly
# (and not when it is imported as a module).
if __name__ == "__main__":
    main()

Javascript

var assert = require('assert');
var elasticsearch = require('elasticsearch');

// Non-java clients connect to Elasticsearch via the REST api,
// which runs on port 9200 by default.
var client = new elasticsearch.Client({
    host: 'localhost:9200'
});

// Elasticsearch can store arbitrary documents
var post = {
    title: "My Document",
    body: "Hello world."
};

// The two posts the searches below are expected to find. They are declared
// up front because several steps of the promise chain refer to them.
var post_1 = {
    "title": "Hello World",
    "body": "This is a post"
};

var post_2 = {
    "title": "Camp Grenada",
    "body": "Hello mother, hello father."
};

// Documents in elasticsearch have an INDEX and a
// TYPE. You can also specify an ID here if you
// want to generate your own.
client.index({
    index: "blog",  // Think "database"
    type: "post",   // Think "table"
    body: post
}).then(function () {
    // Remove every document from the blog index (the index itself is kept)
    // so the searches below only see the two posts indexed next.
    return client.deleteByQuery({
        index: "blog",
        body: {query: {"match_all": {}}}
    });
}).then(function () {
    return client.index({index: "blog", type: "post", body: post_1});
}).then(function () {
    return client.index({index: "blog", type: "post", body: post_2});
}).then(function () {
    // If the index is not refreshed, we won't find our documents right away.
    return client.indices.refresh({index: "blog"});
}).then(function () {
    var query = {
        term: {            // Search for a given term
            _all: "hello"  // in any field (_all is a special wildcard)
        }
    };

    return client.search({
        index: "blog",
        type: "post",
        body: {query: query}
    });
}).then(function (search_result) {
    // The `assert` module throws on failure, so a failed check rejects the
    // chain and reaches the handler at the bottom instead of being ignored.
    assert.ok(!search_result.timed_out);

    // The shard counts assume the server's default settings.
    assert.strictEqual(search_result._shards.total, 5);
    assert.strictEqual(search_result._shards.successful, 5);
    assert.strictEqual(search_result._shards.failed, 0);

    // "Camp Grenada" comes first because it has more hellos
    assert.strictEqual(search_result.hits.total, 2);
    var first = search_result.hits.hits[0]._source;
    assert.strictEqual(first.title, post_2.title);
    assert.strictEqual(first.body, post_2.body);

    var second = search_result.hits.hits[1]._source;
    assert.strictEqual(second.title, post_1.title);
    assert.strictEqual(second.body, post_1.body);
}).then(function () {
    var query = {
        term: {             // Search for a given term
            title: "hello"  // in the title
        }
    };

    return client.search({
        index: "blog",
        type: "post",
        body: {query: query}
    });
}).then(function (search_result) {
    assert.strictEqual(search_result.hits.total, 1);
    console.log("Ok.");
}).catch(function (e) {
    // Attached as a trailing catch so it also sees failures raised by the
    // last success callback; print the cause along with the verdict.
    console.log("FAILED");
    console.error(e);
});

PHP

<?php
require 'vendor/autoload.php';

# Non-java clients connect to Elasticsearch via the REST api,
# which runs on port 9200 by default.
$client = new Elasticsearch\Client(array(
    'hosts' => array('localhost:9200')
));


# Elasticsearch can store arbitrary documents
$post = array(
    "title" => "My Document",
    "body" => "Hello world."
);

# Documents in elasticsearch have an INDEX and a
# TYPE. You can also specify an ID here if you
# want to generate your own.
$client->index(array(
    'index' => 'blog',  # Think "database"
    'type' => "post",   # Think "table"
    'body' => $post
));

# Remove every document from the blog index (the index itself is kept)
# so the searches below only see the two posts indexed next.
$client->deleteByQuery(array(
    "index" => "blog",
    "body" => array(
        'query' => array(
            "match_all" => array()))));


$post_1 = array(
    "title" => "Hello World",
    "body" => "This is a post"
);

$post_2 = array(
    "title" => "Camp Grenada",
    "body" => "Hello mother, hello father."
);

# Index those documents
$client->index(array(
    'index' => 'blog',
    'type' => "post",
    'body' => $post_1));

$client->index(array(
    'index' => 'blog',
    'type' => "post",
    'body' => $post_2));

# If the index is not refreshed, we won't find our documents
# right away.
$client->indices()->refresh(array("index" => "blog"));

$query = array(
    'term' => array(       # Search for a given term
        '_all' => 'hello'  # in any field (_all is a special wildcard)
    )
);

$search_result = $client->search(array(
    "index" => "blog",
    "type" => "post",
    "body" => array('query' => $query)
));

# NOTE: assert() only checks anything when assertions are enabled in the
# PHP configuration; make sure they are on when running this script.

# The search result contains some info about itself
# (this is not exhaustive). The shard counts assume the server's
# default settings.
assert(!$search_result['timed_out']);

assert($search_result['_shards']['total'] === 5);
assert($search_result['_shards']['successful'] === 5);
assert($search_result['_shards']['failed'] === 0);

assert($search_result["hits"]);

# "Camp Grenada" comes first because it has more hellos
assert($search_result["hits"]["total"] === 2);
assert($search_result["hits"]["hits"][0]['_source'] == $post_2);
assert($search_result["hits"]["hits"][1]['_source'] == $post_1);

# We can search by field, too
$query = array(
    'term' => array(        # Search for a given term
        'title' => 'hello'  # in the title
    )
);

$search_result = $client->search(array(
    "index" => "blog",
    "type" => "post",
    "body" => array('query' => $query)
));

# Only "Hello World" has the term in its title.
assert($search_result["hits"]["total"] === 1);
assert($search_result["hits"]["hits"][0]['_source'] == $post_1);

echo "Ok.\n";