Fix a bug in significant_terms (#127975)

nik9000 · web-flow · commit da553b11e356 · 2025-05-09T13:48:19.000-04:00
Fix a bug in the `significant_terms` agg where the "subsetSize" array is
too small because we never collect the ordinal for the agg "above" it.

This mostly hits when you do a `range` agg containing a
`significant_terms` AND you only collect the first few ranges. `range`
isn't particularly popular, but `date_histogram` is super popular and it
rewrites into a `range` pretty commonly - so that's likely what's really
hitting this - a `date_histogram` followed by a `significant_text` where
the matches are all early in the date range held by the shard.
diff --git a/docs/changelog/127975.yaml b/docs/changelog/127975.yaml
@@ -0,0 +1,5 @@
+pr: 127975
+summary: Fix a bug in `significant_terms`
+area: Aggregations
+type: bug
+issues: []
diff --git a/modules/aggregations/src/yamlRestTest/resources/rest-api-spec/test/aggregations/sig_terms.yml b/modules/aggregations/src/yamlRestTest/resources/rest-api-spec/test/aggregations/sig_terms.yml
@@ -73,7 +73,7 @@
   - match: {aggregations.class.buckets.1.sig_terms.buckets.0.key: "good"}
 
 ---
-"Test background filter count ":
+"Test background filter count":
   - requires:
       cluster_features: ["gte_v7.15.0"]
       reason: bugfix introduced in 7.15.0
@@ -153,6 +153,257 @@
         index: goodbad*
         body: {"aggs": {"sig_terms": {"significant_terms": {"field": "text", "background_filter": {"bool": {"filter": [{"term": {"class": "good" }}]}}}}}}
   - match: { aggregations.sig_terms.bg_count: 2 }
+
+---
+"Test background filter count as sub - global ords":
+  - requires:
+      capabilities:
+        - method: POST
+          path: /_search
+          capabilities: [ significant_terms_background_filter_as_sub ]
+      test_runner_features: capabilities
+      reason: "bug fix"
+
+  - do:
+      indices.create:
+        index:  goodbad
+        body:
+          settings:
+            number_of_shards: 1
+          mappings:
+            properties:
+              text:
+                type: keyword
+              class:
+                type: keyword
+  - do:
+      indices.create:
+        index: goodbad-2
+        body:
+          settings:
+            number_of_shards: 1
+          mappings:
+            properties:
+              text:
+                type: keyword
+              class:
+                type: keyword
+
+  - do:
+      index:
+        index: goodbad-2
+        id: "1"
+        body: { group: 1, class: "bad" }
+  - do:
+      index:
+        index: goodbad-2
+        id: "2"
+        body: { group: 1, class: "bad" }
+
+  - do:
+      index:
+        index:  goodbad
+        id:     "1"
+        body:   { group: 1, text: "good", class: "good" }
+  - do:
+      index:
+        index:  goodbad
+        id:     "2"
+        body:   { group: 1, text: "good", class: "good" }
+  - do:
+      index:
+        index:  goodbad
+        id:     "3"
+        body:   { group: 1, text: "bad", class: "bad" }
+  - do:
+      index:
+        index:  goodbad
+        id:     "4"
+        body:   { group: 2, text: "bad", class: "bad" }
+
+  - do:
+      indices.refresh:
+        index: [goodbad, goodbad-2]
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: goodbad*
+  - match: {hits.total: 6}
+
+  - do:
+      search:
+        index: goodbad*
+        body:
+          aggs:
+            group:
+              range:
+                field: group
+                ranges:
+                  # Having many ranges helps catch an issue building no hits buckets
+                  - to: 1
+                  - from: 1
+                    to: 2
+                  - from: 2
+                    to: 3
+                  - from: 3
+                    to: 4
+                  - from: 4
+                    to: 5
+                  - from: 5
+                    to: 6
+              aggs:
+                sig_terms:
+                  significant_terms:
+                    execution_hint: global_ordinals
+                    field: text
+                    background_filter:
+                      bool:
+                        filter: [{term: {class: good }}]
+  - match: { aggregations.group.buckets.0.key: "*-1.0" }
+  - match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.1.key: "1.0-2.0" }
+  - match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
+  - match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.2.key: "2.0-3.0" }
+  - match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
+  - match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.3.key: "3.0-4.0" }
+  - match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.4.key: "4.0-5.0" }
+  - match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.5.key: "5.0-6.0" }
+  - match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
+
+---
+"Test background filter count as sub - map":
+  - requires:
+      capabilities:
+        - method: POST
+          path: /_search
+          capabilities: [ significant_terms_background_filter_as_sub ]
+      test_runner_features: capabilities
+      reason: "bug fix"
+
+  - do:
+      indices.create:
+        index:  goodbad
+        body:
+          settings:
+            number_of_shards: 1
+          mappings:
+            properties:
+              text:
+                type: keyword
+              class:
+                type: keyword
+  - do:
+      indices.create:
+        index: goodbad-2
+        body:
+          settings:
+            number_of_shards: 1
+          mappings:
+            properties:
+              text:
+                type: keyword
+              class:
+                type: keyword
+
+  - do:
+      index:
+        index: goodbad-2
+        id: "1"
+        body: { group: 1, class: "bad" }
+  - do:
+      index:
+        index: goodbad-2
+        id: "2"
+        body: { group: 1, class: "bad" }
+
+  - do:
+      index:
+        index:  goodbad
+        id:     "1"
+        body:   { group: 1, text: "good", class: "good" }
+  - do:
+      index:
+        index:  goodbad
+        id:     "2"
+        body:   { group: 1, text: "good", class: "good" }
+  - do:
+      index:
+        index:  goodbad
+        id:     "3"
+        body:   { group: 1, text: "bad", class: "bad" }
+  - do:
+      index:
+        index:  goodbad
+        id:     "4"
+        body:   { group: 2, text: "bad", class: "bad" }
+
+  - do:
+      indices.refresh:
+        index: [goodbad, goodbad-2]
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: goodbad*
+  - match: {hits.total: 6}
+
+  - do:
+      search:
+        index: goodbad*
+        body:
+          aggs:
+            group:
+              range:
+                field: group
+                ranges:
+                  # Having many ranges helps catch an issue building no hits buckets
+                  - to: 1
+                  - from: 1
+                    to: 2
+                  - from: 2
+                    to: 3
+                  - from: 3
+                    to: 4
+                  - from: 4
+                    to: 5
+                  - from: 5
+                    to: 6
+              aggs:
+                sig_terms:
+                  significant_terms:
+                    execution_hint: map
+                    field: text
+                    background_filter:
+                      bool:
+                        filter: [{term: {class: good }}]
+  - match: { aggregations.group.buckets.0.key: "*-1.0" }
+  - match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.1.key: "1.0-2.0" }
+  - match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
+  - match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.2.key: "2.0-3.0" }
+  - match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
+  - match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.3.key: "3.0-4.0" }
+  - match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.4.key: "4.0-5.0" }
+  - match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
+  - match: { aggregations.group.buckets.5.key: "5.0-6.0" }
+  - match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
+  - match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
+
 ---
 "IP test":
   - do:
diff --git a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java
@@ -48,6 +48,8 @@ private SearchCapabilities() {}
 
     private static final String INDEX_SELECTOR_SYNTAX = "index_expression_selectors";
 
+    private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";
+
     public static final Set<String> CAPABILITIES;
     static {
         HashSet<String> capabilities = new HashSet<>();
@@ -66,6 +68,7 @@ private SearchCapabilities() {}
         capabilities.add(KQL_QUERY_SUPPORTED);
         capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
         capabilities.add(INDEX_SELECTOR_SYNTAX);
+        capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);
         CAPABILITIES = Set.copyOf(capabilities);
     }
 }
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java
@@ -1087,7 +1087,7 @@ SignificantStringTerms buildEmptyResult() {
 
         @Override
         SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {
-            return buildEmptySignificantTermsAggregation(subsetSizes.get(owningBucketOrdinal), supersetSize, significanceHeuristic);
+            return buildEmptySignificantTermsAggregation(subsetSize(owningBucketOrdinal), supersetSize, significanceHeuristic);
         }
 
         @Override
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java
@@ -649,7 +649,7 @@ ObjectArrayPriorityQueue<BucketAndOrd<SignificantStringTerms.Bucket>> buildPrior
 
         @Override
         BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
-            long subsetSize = subsetSizes.get(owningBucketOrd);
+            long subsetSize = subsetSize(owningBucketOrd);
             return (spare, ordsEnum, docCount) -> {
                 ordsEnum.readValue(spare.termBytes);
                 spare.subsetDf = docCount;
@@ -696,7 +696,7 @@ SignificantStringTerms buildResult(long owningBucketOrd, long otherDocCount, Sig
                 bucketCountThresholds.getMinDocCount(),
                 metadata(),
                 format,
-                subsetSizes.get(owningBucketOrd),
+                subsetSize(owningBucketOrd),
                 supersetSize,
                 significanceHeuristic,
                 Arrays.asList(topBuckets)
@@ -712,5 +712,10 @@ SignificantStringTerms buildEmptyResult() {
         public void close() {
             Releasables.close(backgroundFrequencies, subsetSizes);
         }
+
+        private long subsetSize(long owningBucketOrd) {
+            // if the owningBucketOrd is not in the array that means the bucket is empty so the size has to be 0
+            return owningBucketOrd < subsetSizes.size() ? subsetSizes.get(owningBucketOrd) : 0;
+        }
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,8 @@ private SearchCapabilities() {}`
`48`	`48`
`49`	`49`	`private static final String INDEX_SELECTOR_SYNTAX = "index_expression_selectors";`
`50`	`50`
	`51`	`+ private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";`
	`52`	`+`
`51`	`53`	`public static final Set<String> CAPABILITIES;`
`52`	`54`	`static {`
`53`	`55`	`HashSet<String> capabilities = new HashSet<>();`
`@@ -66,6 +68,7 @@ private SearchCapabilities() {}`
`66`	`68`	`capabilities.add(KQL_QUERY_SUPPORTED);`
`67`	`69`	`capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);`
`68`	`70`	`capabilities.add(INDEX_SELECTOR_SYNTAX);`
	`71`	`+ capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);`
`69`	`72`	`CAPABILITIES = Set.copyOf(capabilities);`
`70`	`73`	`}`
`71`	`74`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1087,7 +1087,7 @@ SignificantStringTerms buildEmptyResult() {`
`1087`	`1087`
`1088`	`1088`	`@Override`
`1089`	`1089`	`SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {`
`1090`		`- return buildEmptySignificantTermsAggregation(subsetSizes.get(owningBucketOrdinal), supersetSize, significanceHeuristic);`
	`1090`	`+ return buildEmptySignificantTermsAggregation(subsetSize(owningBucketOrdinal), supersetSize, significanceHeuristic);`
`1091`	`1091`	`}`
`1092`	`1092`
`1093`	`1093`	`@Override`