Added new function ContainedInList and enhanced Concordancer

dmcbeath · dmcbeath · commit 85c7f12d858a · 2021-09-02T14:33:03.000-04:00
Concordancer can now highlight annotations in a variety of colors.
diff --git a/AQPython/Concordancers.py b/AQPython/Concordancers.py
@@ -29,7 +29,7 @@ def _buildHTML(text):
     return "<html><body><table border='1' style='font-family: monospace;table-layout: fixed;'>" + text + "</table></body></html>"   
 
 
-def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None):
+def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None, colorPropertyKey="", colorMap={}):
   """Output HTML for the text identified by the AQAnnotation and highlight in 'red' the text that was ignored (excluded).
 
   Args:
@@ -38,7 +38,8 @@ def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None
     nrows: Number of results to display
     offset: Number of characters before/after each annotation in results to display
     highlightAnnotations: Dataframe of AQAnnotations that you would like to highlight in the results
-
+    colorPropertyKey: Key in the property map of highlightAnnotations to get the value for the color lookup in the specified colorMap
+    colorMap: Map the colorPropertyKey value to the specified color in the Map.  Default is blue when not found.
   Returns:
     HTML
 
@@ -74,7 +75,10 @@ def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None
       if highlightTokens != None:
         hlTokens = [ hlToken for hlToken in highlightTokens if hlToken.docId == annot.docId and hlToken.startOffset >= annot.startOffset and hlToken.endOffset <= annot.endOffset ]
         for hlToken in hlTokens:
-          hlToks.append((hlToken.startOffset,hlToken.startOffset,"hl", "<font color='blue'>"))
+          hlColor = 'blue'
+          if (hlToken.properties != None) and (colorPropertyKey in hlToken.properties):
+            hlColor = colorMap.get(hlToken.properties[colorPropertyKey], 'blue')
+          hlToks.append((hlToken.startOffset,hlToken.startOffset,"hl", "<font color='" + hlColor + "'>"))
           hlToks.append((hlToken.endOffset,hlToken.endOffset,"hl", "</font>")) 
       
       # Process the excludeTokens
diff --git a/AQPython/Query.py b/AQPython/Query.py
@@ -280,6 +280,66 @@ def ContainedIn(left, right, limit=0, negate=False):
   return results
 
 
+def ContainedInList(left, right):
+  """Provide the ability to find annotations that are contained by another annotation.
+  The input is 2 Dataframes of AQAnnotations.  We will call them A ('left') and B ('right').
+  An annotation from A is contained in one from B when both have the same document id and the start/end offset from A falls within (inclusive) the start/end offset from B.
+  A pair whose annotSet, annotType, startOffset and endOffset are all equal is not treated as contained.
+  We return a Dataframe with 2 fields where the first field is an annotation from B and the second field is an array of entries from A
+  that are contained in the first entry.  Annotations from B that contain nothing from A are not returned.
+
+  Args:
+    left: Dataframe of AQAnnotations, the ones we will return (as a list) if they are contained in AQAnnotations from 'right'.
+    right: Dataframe of AQAnnotations, the ones we are looking to see if they contain AQAnnotations from 'left'.
+
+  Returns:
+    Dataframe of (AQAnnotations,Array[AQAnnotations])
+  """
+
+  def containedAQ(group):
+    # group is the (key, rows) pair emitted by groupBy; sort the contained annotations by descending end offset
+    srecs = sorted(group[1], key=lambda x: (-1 if x.LendOffset is None else x.LendOffset), reverse=True)
+
+    # Every row in the group carries the same 'right' annotation, so rebuild it from the first one
+    key = Row(docId = srecs[0].RdocId,
+              annotSet = srecs[0].RannotSet,
+              annotType = srecs[0].RannotType,
+              startOffset = int(srecs[0].RstartOffset),
+              endOffset = int(srecs[0].RendOffset),
+              annotId = int(srecs[0].RannotId),
+              properties = srecs[0].Rproperties)
+
+    # Construct the array of contained 'left' annotations
+    values = [Row(docId = srec.LdocId,
+                  annotSet = srec.LannotSet,
+                  annotType = srec.LannotType,
+                  startOffset = int(srec.LstartOffset),
+                  endOffset = int(srec.LendOffset),
+                  annotId = int(srec.LannotId),
+                  properties = srec.Lproperties)
+              for srec in srecs if srec.LdocId is not None]
+    return (key, values)
+
+  l = left.select("annotId","annotSet","annotType","docId","endOffset","properties","startOffset").toDF("LannotId","LannotSet","LannotType","LdocId","LendOffset","Lproperties","LstartOffset")
+  r = right.select("annotId","annotSet","annotType","docId","endOffset","properties","startOffset").toDF("RannotId","RannotSet","RannotType","RdocId","RendOffset","Rproperties","RstartOffset")
+
+  # Join on containment, skipping pairs that are the same span of the same set/type.
+  # Group on the full identity of the 'right' annotation so distinct annotations sharing a span are not merged.
+  results = l.join(r,
+                   ((col("LdocId") == col("RdocId")) &
+                    (col("LstartOffset") >= col("RstartOffset")) &
+                    (col("LendOffset") <= col("RendOffset")) &
+                    (~((col("LannotSet") == col("RannotSet")) &
+                      (col("LannotType") == col("RannotType")) &
+                      (col("LstartOffset") == col("RstartOffset")) &
+                      (col("LendOffset") == col("RendOffset")))))) \
+                    .rdd \
+                    .groupBy(lambda x: (x["RdocId"],x["RannotSet"],x["RannotType"],x["RannotId"],x["RstartOffset"],x["RendOffset"])) \
+                    .map(containedAQ)
+
+  return spark.createDataFrame(results,AQSchemaList())
+  
+
 def Before(left, right, dist=sys.maxsize , limit=0, negate=False):
   """Provide the ability to find annotations that are before another annotation.
 
diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@
 
 setup(
     name='AnnotationQueryPython-spark3',
-    version='1.0.4',
+    version='1.0.5',
     description='Python implementation for AnnotationQuery',
     long_description=readme,
     author='Darin McBeath',
diff --git a/tests/test_query.py b/tests/test_query.py
@@ -98,6 +98,31 @@ def test_ContainedIn1(self):
     def test_ContainedIn2(self):
         self.assertEquals(2,ContainedIn(FilterType(self.annots, "sentence"), FilterType(self.annots, "ce:para"), negate = True).count())       
 
+    # Test ContainedInList
+    def test_ContainedInList1(self):
+        # Sort the groups by the containing annotation's offsets so the first group checked is deterministic
+        result = ContainedInList(FilterProperty(self.annots,'orig','polynomial'),FilterType(self.annots, 'sentence')).collect()
+        sortedResults =  sorted(result, key=lambda tup: (tup[0]["startOffset"],tup[0]["endOffset"]))
+        self.assertEqual(1, len(sortedResults[0][1]))
+        self.assertEqual(Row(annotId=1, annotSet='ge', annotType='sentence', docId='S0022314X13001777', endOffset=18607, properties={}, startOffset=18546),
+                         Row(annotId=sortedResults[0][0].annotId,
+                             annotSet=sortedResults[0][0].annotSet,
+                             annotType=sortedResults[0][0].annotType,
+                             docId=sortedResults[0][0].docId,
+                             endOffset=sortedResults[0][0].endOffset,
+                             properties=sortedResults[0][0].properties,
+                             startOffset=sortedResults[0][0].startOffset))
+        self.assertEqual(Row(annotId=7, annotSet='ge', annotType='word', docId='S0022314X13001777', endOffset=18574, properties={'lemma': 'polynomial', 'orig': 'polynomial', 'pos': 'jj'}, startOffset=18564),
+                         Row(annotId=sortedResults[0][1][0].annotId,
+                             annotSet=sortedResults[0][1][0].annotSet,
+                             annotType=sortedResults[0][1][0].annotType,
+                             docId=sortedResults[0][1][0].docId,
+                             endOffset=sortedResults[0][1][0].endOffset,
+                             properties=sortedResults[0][1][0].properties,
+                             startOffset=sortedResults[0][1][0].startOffset))
+
+                             
+
     # Test Before
     def test_Before1(self):
         self.assertEquals(47,Before(FilterProperty(self.annots, "orig", "polynomial"), FilterProperty(self.annots, "orig", "function")).count())