Skip to content

Commit b672aeb

Browse files
committed
Added support for TokensSpan and RegexTokensSpan
1 parent 85c7f12 commit b672aeb

10 files changed

+181
-5
lines changed

AQPython/Query.py

+146-1
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,8 @@ def ContainedInList(left, right):
298298

299299
def containedAQ(rec):
300300
# Sort the contained annotations
301-
srecs = sorted(rec[1], key=lambda x: (-1 if x.LendOffset == None else x.LendOffset),reverse=True)
301+
#srecs = sorted(rec[1], key=lambda x: (-1 if x.LendOffset == None else x.LendOffset),reverse=True)
302+
srecs = sorted(rec[1], key=lambda x: (1 if x.LstartOffset == None else x.LstartOffset),reverse=False)
302303

303304
# We can extract the key from the any 'right' entry in sorted recs (we will use the first one)
304305
key = Row(docId = srecs[0].RdocId,
@@ -895,3 +896,147 @@ def followingContainedAQ(rec):
895896

896897
else:
897898
return results
899+
900+
901+
902+
def TokensSpan(tokens, spans, tokenProperty):
    """Create a concatenated string from the tokens contained in each span.

    The specified tokenProperty is used to extract the values from the tokens
    when creating the string. For SCNLP, this tokenProperty could be values like
    'orig', 'lemma', or 'pos'. The spans would typically be a SCNLP 'sentence'
    or could even be things like an OM 'ce:para'.

    Args:
        tokens: Dataframe of AQAnnotations (which we will use to concatenate for the string)
        spans: Dataframe of AQAnnotations (identifies the start/end for the tokens to be used for the concatenated string)
        tokenProperty: The property field in the tokens to use for extracting the value for the concatenated string

    Returns:
        Dataframe[AQAnnotation] of spans with 3 new properties, all prefixed with
        the specified tokenProperty value followed by (ToksStr, ToksSpos, ToksEpos).
        The ToksStr property will be the concatenated string of token property
        values contained in the span. The ToksSpos and ToksEpos properties will
        help us determine the start/end offset for each of the individual tokens
        in the ToksStr. These helper properties are needed by the function
        RegexTokensSpan so we can generate accurate start/end offsets based on
        the str file.
    """
    def process(rec):
        span = rec[0]
        # NOTE: renamed from 'tokens' to avoid shadowing the outer parameter.
        spanTokens = rec[1]
        # Carry all of the span's existing properties forward.
        newProps = {}
        oldProps = span.properties
        for key in oldProps.keys():
            newProps[key] = oldProps[key]
        toksStr = []
        toksSpos = []
        toksEpos = []
        offset = 0
        for token in spanTokens:
            # Tokens that do not carry the requested property are skipped.
            if (token.properties != None) and (tokenProperty in token.properties):
                tokStr = token.properties[tokenProperty]
                toksStr.append(tokStr)
                # Each entry is "offset in concatenated string | original offset".
                toksSpos.append(str(offset) + "|" + str(token.startOffset))
                offset += len(tokStr)
                toksEpos.append(str(offset) + "|" + str(token.endOffset))
                # Account for the single-space separator used by " ".join below.
                offset += 1
        newProps[tokenProperty + "ToksStr"] = " ".join(toksStr)
        newProps[tokenProperty + "ToksSpos"] = " ".join(toksSpos)
        newProps[tokenProperty + "ToksEpos"] = " ".join(toksEpos)

        return Row(docId = span.docId,
                   annotSet = span.annotSet,
                   annotType = span.annotType,
                   startOffset = span.startOffset,
                   endOffset = span.endOffset,
                   annotId = span.annotId,
                   properties = newProps)

    results = ContainedInList(tokens, spans).rdd.map(lambda rec: process(rec))
    return spark.createDataFrame(results, AQSchema())
952+
953+
def RegexTokensSpan(tokensSpan, prop, regex, annotSet="", annotType="", annotProps=None):
    """Apply a regular expression to the concatenated string generated by TokensSpan.

    For the strings matching the regex, a Dataframe[AQAnnotations] will be
    returned. The AQAnnotation will correspond to the offsets within the
    concatenated string containing the match.

    Args:
        tokensSpan: Dataframe of AQAnnotations (the annotations returned from the TokensSpan function)
        prop: the property name (orig, lemma, pos) that was used to generate the string for the span in TokensSpan
        regex: the regular expression to apply to the span
        annotSet: the value to assign to annotSet for the returned matched annotations (default will be the annotSet from the tokensSpan)
        annotType: the value to assign to annotType for the returned matched annotations (default will be the annotType from the tokensSpan)
        annotProps: the additional properties to append to the properties map for the returned matched annotations

    Returns:
        Dataframe[AQAnnotation] for the strings matching the regex
    """
    # Avoid the shared mutable-default-argument pitfall; use a fresh dict per call.
    if annotProps is None:
        annotProps = {}

    def process(partition, prop, regex, annotSet, annotType, annotProps):
        import regex as re
        # NOTE(review): builtins is presumably used so max/min are not shadowed
        # by pyspark.sql.functions at module level -- confirm before removing.
        import builtins as py_builtin

        results = []
        annotId = 0
        # Compile once per partition; reused across every record and match.
        pattern = re.compile(regex)

        for rec in partition:
            if (rec.properties != None) and (prop + "ToksStr" in rec.properties):
                span = rec.properties[prop + "ToksStr"]
                for match in pattern.finditer(span):
                    annotId += 1
                    # Default the annotation set/type to the span's values.
                    newAnnotSet = annotSet if annotSet != "" else rec.annotSet
                    newAnnotType = annotType if annotType != "" else rec.annotType
                    props = {}
                    oldProps = rec.properties
                    for key in annotProps.keys():
                        props[key] = annotProps[key]
                    # Map match.start() in the concatenated string back to the
                    # original document startOffset using the "pos|offset" pairs.
                    startPos = -1
                    startPosLB = []
                    for start in oldProps[prop + "ToksSpos"].split(" "):
                        startToks = start.split("|")
                        if int(startToks[0]) == match.start():
                            startPos = int(startToks[1])
                        if int(startToks[0]) < match.start():
                            startPosLB.append(int(startToks[1]))
                    if startPos == -1:
                        # Match started mid-token; fall back to the nearest
                        # token start before the match.
                        startPos = py_builtin.max(startPosLB)
                    # Map match.end() back to the original document endOffset.
                    endPos = -1
                    endPosLB = []
                    for end in oldProps[prop + "ToksEpos"].split(" "):
                        endToks = end.split("|")
                        if int(endToks[0]) == match.end():
                            endPos = int(endToks[1])
                        if int(endToks[0]) > match.end():
                            endPosLB.append(int(endToks[1]))
                    if endPos == -1:
                        # Match ended mid-token; fall back to the nearest
                        # token end after the match.
                        endPos = py_builtin.min(endPosLB)
                    props[prop + "Match"] = span[match.start():match.end()]
                    # Carry over excludes from the span, but only those fully
                    # contained within the matched offsets.
                    for key in oldProps.keys():
                        if key == "excludes":
                            excludesLB = []
                            for exclude in oldProps[key].split("|"):
                                arr = exclude.split(",")
                                excludeStart = int(arr[3])
                                excludeEnd = int(arr[4])
                                if excludeStart >= startPos and excludeEnd <= endPos:
                                    excludesLB.append(exclude)
                            if len(excludesLB):
                                props["excludes"] = "|".join(excludesLB)

                    annot = Row(docId = rec.docId,
                                annotSet = newAnnotSet,
                                annotType = newAnnotType,
                                startOffset = startPos,
                                endOffset = endPos,
                                annotId = annotId,
                                properties = props)

                    results.append(annot)

        return iter(results)

    results = tokensSpan.rdd.mapPartitions(lambda partition: process(partition, prop, regex, annotSet, annotType, annotProps))
    return spark.createDataFrame(results, AQSchema())

README.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ We realize that you can't have a Dataframe[AQAnnotation] like you can with scala
3838

3939
#### Utilities
4040

41-
The GetAQAnnotation and GetCATAnnotation and utility classes have been developed to create an AQAnnotation from the archive format (CATAnnotation) and vise versa. When creating the AQAnnotation, the ampersand separated string of name-value pairs in the CATAnnotation other field is mapped to a Map in the AQAnnotation record. To minimize memory consumption and increase performance, you can specify which name-value pairs to include in the Map. For more details on the implementation, view the corresponding class for each function in the AQPython Utilities module. For usage examples, view the GetAQAnnotation and GetCATAnnotation classes in the test_utilities module.
41+
The GetAQAnnotation and GetCATAnnotation utility classes have been developed to create an AQAnnotation from the archive format (CATAnnotation) and vice versa. When creating the AQAnnotation, the ampersand-separated string of name-value pairs in the CATAnnotation other field is mapped to a Map in the AQAnnotation record. To minimize memory consumption and increase performance, you can specify which name-value pairs to include in the Map as well as which ones to decode or lower-case. If you want all name-value pairs to be included in the Map, simply specify a value of ["*"] for the parameter in the function. For more details on the implementation, view the corresponding class for each function in the AQPython Utilities module. For usage examples, view the GetAQAnnotation and GetCATAnnotation classes in the test_utilities module.
4242

4343

4444
#### AnnotationQuery Functions
@@ -57,6 +57,8 @@ The following functions are currently provided by AnnotationQuery. Since functio
5757

5858
**ContainedIn** - Provide the ability to find annotations that are contained by another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are contained in B. What that means is the start/end offset for an annotation from A must be contained by the start/end offset from an annotation in B. We of course have to also match on the document id. We ultimately return the contained annotations (A) that meet this criteria. There is also the option of negating the query (think Not Contains) so that we return only A where it is not contained in B.
5959

60+
**ContainedInList** - Provide the ability to find annotations that are contained by another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are contained in B. What that means is the start/end offset for an annotation from A must be contained by the start/end offset from an annotation in B. We of course have to also match on the document id. We ultimately return a Dataframe with 2 fields where the first field is an annotation from B and the second field is an array of entries from A that are contained in the first entry.
61+
6062
**Before** - Provide the ability to find annotations that are before another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are before B. What that means is the end offset for an annotation from A must be before the start offset from an annotation in B. We of course have to also match on the document id. We ultimately return the A annotations that meet this criteria. A distance operator can also be optionally specified. This would require an A annotation (endOffset) to occur n characters (or less) before the B annotation (startOffset). There is also the option of negating the query (think Not Before) so that we return only A where it is not before B.
6163

6264
**After** - Provide the ability to find annotations that are after another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are after B. What that means is the start offset for an annotation from A must be after the end offset from an annotation in B. We of course have to also match on the document id. We ultimately return the A annotations that meet this criteria. A distance operator can also be optionally specified. This would require an A annotation (startOffset) to occur n characters (or less) after the B annotation (endOffset). There is also the option of negating the query (think Not After) so that we return only A where it is not after B.
@@ -75,6 +77,9 @@ The following functions are currently provided by AnnotationQuery. Since functio
7577

7678
**Following** - Return the following sibling annotations for every annotation in the anchor Dataframe[AQAnnotations]. The following sibling annotations can optionally be required to be contained in a container Dataframe[AQAnnotations]. The return type of this function is different from other functions. Instead of returning a Dataframe[AQAnnotation] this function returns a Dataframe[(AQAnnotation,Array[AQAnnotation])].
7779

80+
**TokensSpan** - Provides the ability to create a string from a list of tokens that are contained in a span. The specified tokenProperty is used to extract the values from the tokens when creating the string. For SCNLP, this tokenProperty could be values like 'orig', 'lemma', or 'pos'. The spans would typically be a SCNLP 'sentence' or could even be things like an OM 'ce:para'. Returns a Dataframe[AQAnnotation] of spans with 3 new properties, all prefixed with the specified tokenProperty value followed by (ToksStr, ToksSpos, ToksEpos). The ToksStr property will be the concatenated string of token property values contained in the span. The ToksSpos and ToksEpos properties will help us determine the start/end offset for each of the individual tokens in the ToksStr. These helper properties are needed by the function RegexTokensSpan so we can generate accurate start/end offsets based on the str file.
81+
82+
**RegexTokensSpan** - Provides the ability to apply a regular expression to the concatenated string generated by TokensSpan. For the strings matching the regex, a Dataframe[AQAnnotations] will be returned. The AQAnnotation will correspond to the offsets within the concatenated string containing the match.
7883

7984
#### Concordancers
8085

-15.6 KB
Binary file not shown.
Binary file not shown.
-32.5 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pyspark==3.0.0
2+
regex
23
psutil
34
nose

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
setup(
1515
name='AnnotationQueryPython-spark3',
16-
version='1.0.5',
16+
version='1.0.6',
1717
description='Python implementation for AnnotationQuery',
1818
long_description=readme,
1919
author='Darin McBeath',

tests/test_query.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,6 @@ def test_ContainedInList1(self):
121121
properties=sortedResults[0][1][0].properties,
122122
startOffset=sortedResults[0][1][0].startOffset))
123123

124-
125-
126124
# Test Before
127125
def test_Before1(self):
128126
self.assertEquals(47,Before(FilterProperty(self.annots, "orig", "polynomial"), FilterProperty(self.annots, "orig", "function")).count())
@@ -307,5 +305,32 @@ def test_Following2(self):
307305
properties=sortedResults[0][1][2].properties,
308306
startOffset=sortedResults[0][1][2].startOffset))
309307

308+
def test_TokensSpan(self):
309+
result = TokensSpan(FilterType(self.annots,"word"),FilterType(self.annots,"sentence"),"orig").collect()
310+
sortedResults = sorted(result, key=lambda rec: (rec.docId,rec.startOffset))
311+
self.assertEquals(128,len(sortedResults))
312+
self.assertEquals(Row(annotId=1, annotSet='ge', annotType='sentence', docId='S0022314X13001777', endOffset=18607, properties={'origToksEpos' : '5|18551 14|18560 17|18563 28|18574 41|18587 44|18590 48|18594 57|18603 61|18607', 'origToksSpos' : '0|18546 6|18552 15|18561 18|18564 29|18575 42|18588 45|18591 49|18595 58|18604', 'origToksStr': 'Sylow p-groups of polynomial permutations on the integers mod'},startOffset=18546),
313+
Row(annotId=sortedResults[0].annotId,
314+
annotSet=sortedResults[0].annotSet,
315+
annotType=sortedResults[0].annotType,
316+
docId=sortedResults[0].docId,
317+
endOffset=sortedResults[0].endOffset,
318+
properties=sortedResults[0].properties,
319+
startOffset=sortedResults[0].startOffset))
320+
321+
def test_RegexTokensSpan(self):
322+
tokensSpan = TokensSpan(FilterType(self.annots,"word"),FilterType(self.annots,"sentence"),"orig")
323+
result = RegexTokensSpan(tokensSpan,"orig",r"(?i)(?<= |^)poly[a-z]+ perm[a-z]+(?= |$$)","newSet","newType").collect()
324+
sortedResults = sorted(result, key=lambda rec: (rec.docId,rec.startOffset))
325+
self.assertEquals(18,len(sortedResults))
326+
self.assertEquals(Row(annotId=1, annotSet='newSet', annotType='newType', docId='S0022314X13001777', endOffset=18587, properties={'origMatch': 'polynomial permutations'}, startOffset=18564),
327+
Row(annotId=sortedResults[0].annotId,
328+
annotSet=sortedResults[0].annotSet,
329+
annotType=sortedResults[0].annotType,
330+
docId=sortedResults[0].docId,
331+
endOffset=sortedResults[0].endOffset,
332+
properties=sortedResults[0].properties,
333+
startOffset=sortedResults[0].startOffset))
334+
310335
if __name__ == "__main__":
311336
unittest.main()

0 commit comments

Comments
 (0)