Skip to content

Commit da553b1

Browse files
authored
Fix a bug in significant_terms (#127975)
Fix a bug in the `significant_terms` agg where the "subsetSize" array is too small because we never collect the ordinal for the agg "above" it. This mostly hits when the you do a `range` agg containing a `significant_terms` AND you only collect the first few ranges. `range` isn't particularly popular, but `date_histogram` is super popular and it rewrites into a `range` pretty commonly - so that's likely what's really hitting this - a `date_histogram` followed by a `significant_text` where the matches are all early in the date range held by the shard.
1 parent 5447909 commit da553b1

File tree

5 files changed

+268
-4
lines changed

5 files changed

+268
-4
lines changed

docs/changelog/127975.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 127975
2+
summary: Fix a bug in `significant_terms`
3+
area: Aggregations
4+
type: bug
5+
issues: []

modules/aggregations/src/yamlRestTest/resources/rest-api-spec/test/aggregations/sig_terms.yml

+252-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
- match: {aggregations.class.buckets.1.sig_terms.buckets.0.key: "good"}
7474

7575
---
76-
"Test background filter count ":
76+
"Test background filter count":
7777
- requires:
7878
cluster_features: ["gte_v7.15.0"]
7979
reason: bugfix introduced in 7.15.0
@@ -153,6 +153,257 @@
153153
index: goodbad*
154154
body: {"aggs": {"sig_terms": {"significant_terms": {"field": "text", "background_filter": {"bool": {"filter": [{"term": {"class": "good" }}]}}}}}}
155155
- match: { aggregations.sig_terms.bg_count: 2 }
156+
157+
---
158+
"Test background filter count as sub - global ords":
159+
- requires:
160+
capabilities:
161+
- method: POST
162+
path: /_search
163+
capabilities: [ significant_terms_background_filter_as_sub ]
164+
test_runner_features: capabilities
165+
reason: "bug fix"
166+
167+
- do:
168+
indices.create:
169+
index: goodbad
170+
body:
171+
settings:
172+
number_of_shards: 1
173+
mappings:
174+
properties:
175+
text:
176+
type: keyword
177+
class:
178+
type: keyword
179+
- do:
180+
indices.create:
181+
index: goodbad-2
182+
body:
183+
settings:
184+
number_of_shards: 1
185+
mappings:
186+
properties:
187+
text:
188+
type: keyword
189+
class:
190+
type: keyword
191+
192+
- do:
193+
index:
194+
index: goodbad-2
195+
id: "1"
196+
body: { group: 1, class: "bad" }
197+
- do:
198+
index:
199+
index: goodbad-2
200+
id: "2"
201+
body: { group: 1, class: "bad" }
202+
203+
- do:
204+
index:
205+
index: goodbad
206+
id: "1"
207+
body: { group: 1, text: "good", class: "good" }
208+
- do:
209+
index:
210+
index: goodbad
211+
id: "2"
212+
body: { group: 1, text: "good", class: "good" }
213+
- do:
214+
index:
215+
index: goodbad
216+
id: "3"
217+
body: { group: 1, text: "bad", class: "bad" }
218+
- do:
219+
index:
220+
index: goodbad
221+
id: "4"
222+
body: { group: 2, text: "bad", class: "bad" }
223+
224+
- do:
225+
indices.refresh:
226+
index: [goodbad, goodbad-2]
227+
228+
- do:
229+
search:
230+
rest_total_hits_as_int: true
231+
index: goodbad*
232+
- match: {hits.total: 6}
233+
234+
- do:
235+
search:
236+
index: goodbad*
237+
body:
238+
aggs:
239+
group:
240+
range:
241+
field: group
242+
ranges:
243+
# Having many ranges helps catch an issue building no hits buckets
244+
- to: 1
245+
- from: 1
246+
to: 2
247+
- from: 2
248+
to: 3
249+
- from: 3
250+
to: 4
251+
- from: 4
252+
to: 5
253+
- from: 5
254+
to: 6
255+
aggs:
256+
sig_terms:
257+
significant_terms:
258+
execution_hint: global_ordinals
259+
field: text
260+
background_filter:
261+
bool:
262+
filter: [{term: {class: good }}]
263+
- match: { aggregations.group.buckets.0.key: "*-1.0" }
264+
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
265+
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
266+
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
267+
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
268+
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
269+
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
270+
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
271+
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
272+
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
273+
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
274+
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
275+
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
276+
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
277+
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
278+
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
279+
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
280+
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
281+
282+
---
283+
"Test background filter count as sub - map":
284+
- requires:
285+
capabilities:
286+
- method: POST
287+
path: /_search
288+
capabilities: [ significant_terms_background_filter_as_sub ]
289+
test_runner_features: capabilities
290+
reason: "bug fix"
291+
292+
- do:
293+
indices.create:
294+
index: goodbad
295+
body:
296+
settings:
297+
number_of_shards: 1
298+
mappings:
299+
properties:
300+
text:
301+
type: keyword
302+
class:
303+
type: keyword
304+
- do:
305+
indices.create:
306+
index: goodbad-2
307+
body:
308+
settings:
309+
number_of_shards: 1
310+
mappings:
311+
properties:
312+
text:
313+
type: keyword
314+
class:
315+
type: keyword
316+
317+
- do:
318+
index:
319+
index: goodbad-2
320+
id: "1"
321+
body: { group: 1, class: "bad" }
322+
- do:
323+
index:
324+
index: goodbad-2
325+
id: "2"
326+
body: { group: 1, class: "bad" }
327+
328+
- do:
329+
index:
330+
index: goodbad
331+
id: "1"
332+
body: { group: 1, text: "good", class: "good" }
333+
- do:
334+
index:
335+
index: goodbad
336+
id: "2"
337+
body: { group: 1, text: "good", class: "good" }
338+
- do:
339+
index:
340+
index: goodbad
341+
id: "3"
342+
body: { group: 1, text: "bad", class: "bad" }
343+
- do:
344+
index:
345+
index: goodbad
346+
id: "4"
347+
body: { group: 2, text: "bad", class: "bad" }
348+
349+
- do:
350+
indices.refresh:
351+
index: [goodbad, goodbad-2]
352+
353+
- do:
354+
search:
355+
rest_total_hits_as_int: true
356+
index: goodbad*
357+
- match: {hits.total: 6}
358+
359+
- do:
360+
search:
361+
index: goodbad*
362+
body:
363+
aggs:
364+
group:
365+
range:
366+
field: group
367+
ranges:
368+
# Having many ranges helps catch an issue building no hits buckets
369+
- to: 1
370+
- from: 1
371+
to: 2
372+
- from: 2
373+
to: 3
374+
- from: 3
375+
to: 4
376+
- from: 4
377+
to: 5
378+
- from: 5
379+
to: 6
380+
aggs:
381+
sig_terms:
382+
significant_terms:
383+
execution_hint: map
384+
field: text
385+
background_filter:
386+
bool:
387+
filter: [{term: {class: good }}]
388+
- match: { aggregations.group.buckets.0.key: "*-1.0" }
389+
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
390+
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
391+
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
392+
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
393+
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
394+
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
395+
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
396+
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
397+
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
398+
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
399+
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
400+
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
401+
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
402+
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
403+
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
404+
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
405+
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
406+
156407
---
157408
"IP test":
158409
- do:

server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java

+3
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ private SearchCapabilities() {}
4848

4949
private static final String INDEX_SELECTOR_SYNTAX = "index_expression_selectors";
5050

51+
private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";
52+
5153
public static final Set<String> CAPABILITIES;
5254
static {
5355
HashSet<String> capabilities = new HashSet<>();
@@ -66,6 +68,7 @@ private SearchCapabilities() {}
6668
capabilities.add(KQL_QUERY_SUPPORTED);
6769
capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
6870
capabilities.add(INDEX_SELECTOR_SYNTAX);
71+
capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);
6972
CAPABILITIES = Set.copyOf(capabilities);
7073
}
7174
}

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -1087,7 +1087,7 @@ SignificantStringTerms buildEmptyResult() {
10871087

10881088
@Override
10891089
SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {
1090-
return buildEmptySignificantTermsAggregation(subsetSizes.get(owningBucketOrdinal), supersetSize, significanceHeuristic);
1090+
return buildEmptySignificantTermsAggregation(subsetSize(owningBucketOrdinal), supersetSize, significanceHeuristic);
10911091
}
10921092

10931093
@Override

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

+7-2
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,7 @@ ObjectArrayPriorityQueue<BucketAndOrd<SignificantStringTerms.Bucket>> buildPrior
649649

650650
@Override
651651
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
652-
long subsetSize = subsetSizes.get(owningBucketOrd);
652+
long subsetSize = subsetSize(owningBucketOrd);
653653
return (spare, ordsEnum, docCount) -> {
654654
ordsEnum.readValue(spare.termBytes);
655655
spare.subsetDf = docCount;
@@ -696,7 +696,7 @@ SignificantStringTerms buildResult(long owningBucketOrd, long otherDocCount, Sig
696696
bucketCountThresholds.getMinDocCount(),
697697
metadata(),
698698
format,
699-
subsetSizes.get(owningBucketOrd),
699+
subsetSize(owningBucketOrd),
700700
supersetSize,
701701
significanceHeuristic,
702702
Arrays.asList(topBuckets)
@@ -712,5 +712,10 @@ SignificantStringTerms buildEmptyResult() {
712712
public void close() {
713713
Releasables.close(backgroundFrequencies, subsetSizes);
714714
}
715+
716+
private long subsetSize(long owningBucketOrd) {
717+
// if the owningBucketOrd is not in the array that means the bucket is empty so the size has to be 0
718+
return owningBucketOrd < subsetSizes.size() ? subsetSizes.get(owningBucketOrd) : 0;
719+
}
715720
}
716721
}

0 commit comments

Comments
 (0)