normalizer
The normalizer property of keyword fields is similar to analyzer except that it guarantees that the analysis chain produces a single token.

The normalizer is applied prior to indexing the keyword, as well as at search-time when the keyword field is searched via a query parser such as the match query or via a term-level query such as the term query.

A simple normalizer called lowercase ships with Elasticsearch and can be used out of the box.
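For instance, the built-in lowercase normalizer can be referenced directly from a keyword mapping without any analysis settings. The following is a minimal sketch using the Python client, consistent with the other examples on this page; the index name my-index-000001 and field name code are illustrative only:

# Minimal sketch: reference the built-in `lowercase` normalizer directly in a mapping.
# Assumes a configured `client` instance, as in the examples below; the index and
# field names here are hypothetical.
resp = client.indices.create(
    index="my-index-000001",
    mappings={
        "properties": {
            "code": {
                "type": "keyword",
                "normalizer": "lowercase"
            }
        }
    },
)
print(resp)

Custom normalizers can be defined as part of analysis settings as follows.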
resp = client.indices.create(
    index="index",
    settings={
        "analysis": {
            "normalizer": {
                "my_normalizer": {
                    "type": "custom",
                    "char_filter": [],
                    "filter": [
                        "lowercase",
                        "asciifolding"
                    ]
                }
            }
        }
    },
    mappings={
        "properties": {
            "foo": {
                "type": "keyword",
                "normalizer": "my_normalizer"
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="index",
    id="1",
    document={
        "foo": "BÀR"
    },
)
print(resp1)

resp2 = client.index(
    index="index",
    id="2",
    document={
        "foo": "bar"
    },
)
print(resp2)

resp3 = client.index(
    index="index",
    id="3",
    document={
        "foo": "baz"
    },
)
print(resp3)

resp4 = client.indices.refresh(
    index="index",
)
print(resp4)

resp5 = client.search(
    index="index",
    query={
        "term": {
            "foo": "BAR"
        }
    },
)
print(resp5)

resp6 = client.search(
    index="index",
    query={
        "match": {
            "foo": "BAR"
        }
    },
)
print(resp6)
response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      analysis: {
        normalizer: {
          my_normalizer: {
            type: 'custom',
            char_filter: [],
            filter: [
              'lowercase',
              'asciifolding'
            ]
          }
        }
      }
    },
    mappings: {
      properties: {
        foo: {
          type: 'keyword',
          normalizer: 'my_normalizer'
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'index',
  id: 1,
  body: {
    foo: 'BÀR'
  }
)
puts response

response = client.index(
  index: 'index',
  id: 2,
  body: {
    foo: 'bar'
  }
)
puts response

response = client.index(
  index: 'index',
  id: 3,
  body: {
    foo: 'baz'
  }
)
puts response

response = client.indices.refresh(
  index: 'index'
)
puts response

response = client.search(
  index: 'index',
  body: {
    query: {
      term: {
        foo: 'BAR'
      }
    }
  }
)
puts response

response = client.search(
  index: 'index',
  body: {
    query: {
      match: {
        foo: 'BAR'
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "index",
  settings: {
    analysis: {
      normalizer: {
        my_normalizer: {
          type: "custom",
          char_filter: [],
          filter: ["lowercase", "asciifolding"],
        },
      },
    },
  },
  mappings: {
    properties: {
      foo: {
        type: "keyword",
        normalizer: "my_normalizer",
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "index",
  id: 1,
  document: {
    foo: "BÀR",
  },
});
console.log(response1);

const response2 = await client.index({
  index: "index",
  id: 2,
  document: {
    foo: "bar",
  },
});
console.log(response2);

const response3 = await client.index({
  index: "index",
  id: 3,
  document: {
    foo: "baz",
  },
});
console.log(response3);

const response4 = await client.indices.refresh({
  index: "index",
});
console.log(response4);

const response5 = await client.search({
  index: "index",
  query: {
    term: {
      foo: "BAR",
    },
  },
});
console.log(response5);

const response6 = await client.search({
  index: "index",
  query: {
    match: {
      foo: "BAR",
    },
  },
});
console.log(response6);
PUT index
{
  "settings": {
    "analysis": {
      "normalizer": {
        "my_normalizer": {
          "type": "custom",
          "char_filter": [],
          "filter": ["lowercase", "asciifolding"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "foo": {
        "type": "keyword",
        "normalizer": "my_normalizer"
      }
    }
  }
}

PUT index/_doc/1
{
  "foo": "BÀR"
}

PUT index/_doc/2
{
  "foo": "bar"
}

PUT index/_doc/3
{
  "foo": "baz"
}

POST index/_refresh

GET index/_search
{
  "query": {
    "term": {
      "foo": "BAR"
    }
  }
}

GET index/_search
{
  "query": {
    "match": {
      "foo": "BAR"
    }
  }
}
The above queries match documents 1 and 2 since BÀR is converted to bar at both index and query time.
{
  "took": $body.took,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 2,
      "relation": "eq"
    },
    "max_score": 0.4700036,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.4700036,
        "_source": {
          "foo": "BÀR"
        }
      },
      {
        "_index": "index",
        "_id": "2",
        "_score": 0.4700036,
        "_source": {
          "foo": "bar"
        }
      }
    ]
  }
}
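The effect of the normalizer can also be inspected directly with the analyze API, which accepts a normalizer parameter. The following is a minimal sketch using the Python client, assuming the index and my_normalizer defined above and that indices.analyze exposes the normalizer and text parameters as in recent client versions; the response should contain a single token, bar:

# Minimal sketch: run the analyze API with the custom normalizer defined above.
# Expected result: a single token "bar" for the input "BÀR".
resp = client.indices.analyze(
    index="index",
    normalizer="my_normalizer",
    text="BÀR",
)
print(resp)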
The fact that keywords are converted prior to indexing also means that aggregations return normalized values:
resp = client.search(
    index="index",
    size=0,
    aggs={
        "foo_terms": {
            "terms": {
                "field": "foo"
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'index',
  body: {
    size: 0,
    aggregations: {
      foo_terms: {
        terms: {
          field: 'foo'
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "index",
  size: 0,
  aggs: {
    foo_terms: {
      terms: {
        field: "foo",
      },
    },
  },
});
console.log(response);
GET index/_search
{
  "size": 0,
  "aggs": {
    "foo_terms": {
      "terms": {
        "field": "foo"
      }
    }
  }
}
which returns:
{
  "took": 43,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 3,
      "relation": "eq"
    },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "foo_terms": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "bar",
          "doc_count": 2
        },
        {
          "key": "baz",
          "doc_count": 1
        }
      ]
    }
  }
}
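Because keywords are normalized prior to indexing, the buckets above contain bar rather than BÀR; the stored _source, however, keeps the original value, as the earlier search response shows. A minimal sketch with the Python client that retrieves document 1 and prints the original value:

# Minimal sketch: the original, non-normalized value is still returned from _source.
resp = client.get(
    index="index",
    id="1",
)
print(resp["_source"]["foo"])  # "BÀR", while the indexed term is "bar"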