normalizer

normalizer

The normalizer property of keyword fields is similar to analyzer except that it guarantees that the analysis chain produces a single token.

The normalizer is applied prior to indexing the keyword, as well as at search-time when the keyword field is searched via a query parser such as the match query or via a term-level query such as the term query.

A simple normalizer called lowercase ships with Elasticsearch and can be used directly. Custom normalizers can be defined as part of the analysis settings, as follows:

  1. resp = client.indices.create(
  2. index="index",
  3. settings={
  4. "analysis": {
  5. "normalizer": {
  6. "my_normalizer": {
  7. "type": "custom",
  8. "char_filter": [],
  9. "filter": [
  10. "lowercase",
  11. "asciifolding"
  12. ]
  13. }
  14. }
  15. }
  16. },
  17. mappings={
  18. "properties": {
  19. "foo": {
  20. "type": "keyword",
  21. "normalizer": "my_normalizer"
  22. }
  23. }
  24. },
  25. )
  26. print(resp)
  27. resp1 = client.index(
  28. index="index",
  29. id="1",
  30. document={
  31. "foo": "BÀR"
  32. },
  33. )
  34. print(resp1)
  35. resp2 = client.index(
  36. index="index",
  37. id="2",
  38. document={
  39. "foo": "bar"
  40. },
  41. )
  42. print(resp2)
  43. resp3 = client.index(
  44. index="index",
  45. id="3",
  46. document={
  47. "foo": "baz"
  48. },
  49. )
  50. print(resp3)
  51. resp4 = client.indices.refresh(
  52. index="index",
  53. )
  54. print(resp4)
  55. resp5 = client.search(
  56. index="index",
  57. query={
  58. "term": {
  59. "foo": "BAR"
  60. }
  61. },
  62. )
  63. print(resp5)
  64. resp6 = client.search(
  65. index="index",
  66. query={
  67. "match": {
  68. "foo": "BAR"
  69. }
  70. },
  71. )
  72. print(resp6)
  1. response = client.indices.create(
  2. index: 'index',
  3. body: {
  4. settings: {
  5. analysis: {
  6. normalizer: {
  7. my_normalizer: {
  8. type: 'custom',
  9. char_filter: [],
  10. filter: [
  11. 'lowercase',
  12. 'asciifolding'
  13. ]
  14. }
  15. }
  16. }
  17. },
  18. mappings: {
  19. properties: {
  20. foo: {
  21. type: 'keyword',
  22. normalizer: 'my_normalizer'
  23. }
  24. }
  25. }
  26. }
  27. )
  28. puts response
  29. response = client.index(
  30. index: 'index',
  31. id: 1,
  32. body: {
  33. foo: 'BÀR'
  34. }
  35. )
  36. puts response
  37. response = client.index(
  38. index: 'index',
  39. id: 2,
  40. body: {
  41. foo: 'bar'
  42. }
  43. )
  44. puts response
  45. response = client.index(
  46. index: 'index',
  47. id: 3,
  48. body: {
  49. foo: 'baz'
  50. }
  51. )
  52. puts response
  53. response = client.indices.refresh(
  54. index: 'index'
  55. )
  56. puts response
  57. response = client.search(
  58. index: 'index',
  59. body: {
  60. query: {
  61. term: {
  62. foo: 'BAR'
  63. }
  64. }
  65. }
  66. )
  67. puts response
  68. response = client.search(
  69. index: 'index',
  70. body: {
  71. query: {
  72. match: {
  73. foo: 'BAR'
  74. }
  75. }
  76. }
  77. )
  78. puts response
  1. const response = await client.indices.create({
  2. index: "index",
  3. settings: {
  4. analysis: {
  5. normalizer: {
  6. my_normalizer: {
  7. type: "custom",
  8. char_filter: [],
  9. filter: ["lowercase", "asciifolding"],
  10. },
  11. },
  12. },
  13. },
  14. mappings: {
  15. properties: {
  16. foo: {
  17. type: "keyword",
  18. normalizer: "my_normalizer",
  19. },
  20. },
  21. },
  22. });
  23. console.log(response);
  24. const response1 = await client.index({
  25. index: "index",
  26. id: 1,
  27. document: {
  28. foo: "BÀR",
  29. },
  30. });
  31. console.log(response1);
  32. const response2 = await client.index({
  33. index: "index",
  34. id: 2,
  35. document: {
  36. foo: "bar",
  37. },
  38. });
  39. console.log(response2);
  40. const response3 = await client.index({
  41. index: "index",
  42. id: 3,
  43. document: {
  44. foo: "baz",
  45. },
  46. });
  47. console.log(response3);
  48. const response4 = await client.indices.refresh({
  49. index: "index",
  50. });
  51. console.log(response4);
  52. const response5 = await client.search({
  53. index: "index",
  54. query: {
  55. term: {
  56. foo: "BAR",
  57. },
  58. },
  59. });
  60. console.log(response5);
  61. const response6 = await client.search({
  62. index: "index",
  63. query: {
  64. match: {
  65. foo: "BAR",
  66. },
  67. },
  68. });
  69. console.log(response6);
  1. PUT index
  2. {
  3. "settings": {
  4. "analysis": {
  5. "normalizer": {
  6. "my_normalizer": {
  7. "type": "custom",
  8. "char_filter": [],
  9. "filter": ["lowercase", "asciifolding"]
  10. }
  11. }
  12. }
  13. },
  14. "mappings": {
  15. "properties": {
  16. "foo": {
  17. "type": "keyword",
  18. "normalizer": "my_normalizer"
  19. }
  20. }
  21. }
  22. }
  23. PUT index/_doc/1
  24. {
  25. "foo": "BÀR"
  26. }
  27. PUT index/_doc/2
  28. {
  29. "foo": "bar"
  30. }
  31. PUT index/_doc/3
  32. {
  33. "foo": "baz"
  34. }
  35. POST index/_refresh
  36. GET index/_search
  37. {
  38. "query": {
  39. "term": {
  40. "foo": "BAR"
  41. }
  42. }
  43. }
  44. GET index/_search
  45. {
  46. "query": {
  47. "match": {
  48. "foo": "BAR"
  49. }
  50. }
  51. }

The above queries match documents 1 and 2 because the normalizer is applied at both index time and query time: the indexed value BÀR and the query term BAR are both converted to bar.

  1. {
  2. "took": 43,
  3. "timed_out": false,
  4. "_shards": {
  5. "total": 1,
  6. "successful": 1,
  7. "skipped" : 0,
  8. "failed": 0
  9. },
  10. "hits": {
  11. "total" : {
  12. "value": 2,
  13. "relation": "eq"
  14. },
  15. "max_score": 0.4700036,
  16. "hits": [
  17. {
  18. "_index": "index",
  19. "_id": "1",
  20. "_score": 0.4700036,
  21. "_source": {
  22. "foo": "BÀR"
  23. }
  24. },
  25. {
  26. "_index": "index",
  27. "_id": "2",
  28. "_score": 0.4700036,
  29. "_source": {
  30. "foo": "bar"
  31. }
  32. }
  33. ]
  34. }
  35. }

Because keywords are converted prior to indexing, aggregations also return normalized values:

  1. resp = client.search(
  2. index="index",
  3. size=0,
  4. aggs={
  5. "foo_terms": {
  6. "terms": {
  7. "field": "foo"
  8. }
  9. }
  10. },
  11. )
  12. print(resp)
  1. response = client.search(
  2. index: 'index',
  3. body: {
  4. size: 0,
  5. aggregations: {
  6. foo_terms: {
  7. terms: {
  8. field: 'foo'
  9. }
  10. }
  11. }
  12. }
  13. )
  14. puts response
  1. const response = await client.search({
  2. index: "index",
  3. size: 0,
  4. aggs: {
  5. foo_terms: {
  6. terms: {
  7. field: "foo",
  8. },
  9. },
  10. },
  11. });
  12. console.log(response);
  1. GET index/_search
  2. {
  3. "size": 0,
  4. "aggs": {
  5. "foo_terms": {
  6. "terms": {
  7. "field": "foo"
  8. }
  9. }
  10. }
  11. }

returns

  1. {
  2. "took": 43,
  3. "timed_out": false,
  4. "_shards": {
  5. "total": 1,
  6. "successful": 1,
  7. "skipped" : 0,
  8. "failed": 0
  9. },
  10. "hits": {
  11. "total" : {
  12. "value": 3,
  13. "relation": "eq"
  14. },
  15. "max_score": null,
  16. "hits": []
  17. },
  18. "aggregations": {
  19. "foo_terms": {
  20. "doc_count_error_upper_bound": 0,
  21. "sum_other_doc_count": 0,
  22. "buckets": [
  23. {
  24. "key": "bar",
  25. "doc_count": 2
  26. },
  27. {
  28. "key": "baz",
  29. "doc_count": 1
  30. }
  31. ]
  32. }
  33. }
  34. }