Skip to content

Commit b672aeb

Browse files
committed
Added support for TokensSpan and RegexTokensSpan
1 parent 85c7f12 commit b672aeb

10 files changed

+181
-5
lines changed

AQPython/Query.py

+146-1
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,8 @@ def ContainedInList(left, right):
298298

299299
def containedAQ(rec):
300300
# Sort the contained annotations
301-
srecs = sorted(rec[1], key=lambda x: (-1 if x.LendOffset == None else x.LendOffset),reverse=True)
301+
#srecs = sorted(rec[1], key=lambda x: (-1 if x.LendOffset == None else x.LendOffset),reverse=True)
302+
srecs = sorted(rec[1], key=lambda x: (1 if x.LstartOffset == None else x.LstartOffset),reverse=False)
302303

303304
# We can extract the key from the any 'right' entry in sorted recs (we will use the first one)
304305
key = Row(docId = srecs[0].RdocId,
@@ -895,3 +896,147 @@ def followingContainedAQ(rec):
895896

896897
else:
897898
return results
899+
900+
901+
902+
def TokensSpan(tokens, spans, tokenProperty):
    """Create a concatenated string from the tokens contained in each span.

    The specified tokenProperty is used to extract the values from the tokens
    when creating the string. For SCNLP, this tokenProperty could be values like
    'orig', 'lemma', or 'pos'. The spans would typically be a SCNLP 'sentence'
    or could even be things like an OM 'ce:para'.

    Args:
        tokens: Dataframe of AQAnnotations (which we will use to concatenate for the string)
        spans: Dataframe of AQAnnotations (identifies the start/end for the tokens to be used for the concatenated string)
        tokenProperty: The property field in the tokens to use for extracting the value for the concatenated string

    Returns:
        Dataframe[AQAnnotation] of spans with 3 new properties, all prefixed with
        the specified tokenProperty value followed by (ToksStr, ToksSpos, ToksEpos).
        The ToksStr property will be the concatenated string of token property
        values contained in the span. The ToksSpos and ToksEpos properties will
        help us determine the start/end offset for each of the individual tokens
        in the ToksStr. These helper properties are needed by the function
        RegexTokensSpan so we can generate accurate start/end offsets based on
        the str file.
    """
    def process(rec):
        span = rec[0]
        # NOTE: renamed from 'tokens' to avoid shadowing the outer parameter.
        spanTokens = rec[1]
        # Carry all of the span's existing properties forward.
        newProps = {}
        oldProps = span.properties
        for key in oldProps.keys():
            newProps[key] = oldProps[key]
        toksStr = []
        toksSpos = []
        toksEpos = []
        offset = 0
        for token in spanTokens:
            # Tokens that do not carry the requested property are skipped.
            if (token.properties != None) and (tokenProperty in token.properties):
                tokStr = token.properties[tokenProperty]
                toksStr.append(tokStr)
                # Each entry is "offset in concatenated string | original offset".
                toksSpos.append(str(offset) + "|" + str(token.startOffset))
                offset += len(tokStr)
                toksEpos.append(str(offset) + "|" + str(token.endOffset))
                # Account for the single-space separator used by " ".join below.
                offset += 1
        newProps[tokenProperty + "ToksStr"] = " ".join(toksStr)
        newProps[tokenProperty + "ToksSpos"] = " ".join(toksSpos)
        newProps[tokenProperty + "ToksEpos"] = " ".join(toksEpos)

        return Row(docId = span.docId,
                   annotSet = span.annotSet,
                   annotType = span.annotType,
                   startOffset = span.startOffset,
                   endOffset = span.endOffset,
                   annotId = span.annotId,
                   properties = newProps)

    results = ContainedInList(tokens, spans).rdd.map(lambda rec: process(rec))
    return spark.createDataFrame(results, AQSchema())
952+
953+
def RegexTokensSpan(tokensSpan, prop, regex, annotSet="", annotType="", annotProps=None):
    """Apply a regular expression to the concatenated string generated by TokensSpan.

    For the strings matching the regex, a Dataframe[AQAnnotations] will be
    returned. The AQAnnotation will correspond to the offsets within the
    concatenated string containing the match.

    Args:
        tokensSpan: Dataframe of AQAnnotations (the annotations returned from the TokensSpan function)
        prop: the property name (orig, lemma, pos) that was used to generate the string for the span in TokensSpan
        regex: the regular expression to apply to the span
        annotSet: the value to assign to annotSet for the returned matched annotations (default will be the annotSet from the tokensSpan)
        annotType: the value to assign to annotType for the returned matched annotations (default will be the annotType from the tokensSpan)
        annotProps: the additional properties to append to the properties map for the returned matched annotations

    Returns:
        Dataframe[AQAnnotation] for the strings matching the regex
    """
    # Avoid the shared mutable-default-argument pitfall; use a fresh dict per call.
    if annotProps is None:
        annotProps = {}

    def process(partition, prop, regex, annotSet, annotType, annotProps):
        import regex as re
        # NOTE(review): builtins is presumably used so max/min are not shadowed
        # by pyspark.sql.functions at module level -- confirm before removing.
        import builtins as py_builtin

        results = []
        annotId = 0
        # Compile once per partition; reused across every record and match.
        pattern = re.compile(regex)

        for rec in partition:
            if (rec.properties != None) and (prop + "ToksStr" in rec.properties):
                span = rec.properties[prop + "ToksStr"]
                for match in pattern.finditer(span):
                    annotId += 1
                    # Default the annotation set/type to the span's values.
                    newAnnotSet = annotSet if annotSet != "" else rec.annotSet
                    newAnnotType = annotType if annotType != "" else rec.annotType
                    props = {}
                    oldProps = rec.properties
                    for key in annotProps.keys():
                        props[key] = annotProps[key]
                    # Map match.start() in the concatenated string back to the
                    # original document startOffset using the "pos|offset" pairs.
                    startPos = -1
                    startPosLB = []
                    for start in oldProps[prop + "ToksSpos"].split(" "):
                        startToks = start.split("|")
                        if int(startToks[0]) == match.start():
                            startPos = int(startToks[1])
                        if int(startToks[0]) < match.start():
                            startPosLB.append(int(startToks[1]))
                    if startPos == -1:
                        # Match started mid-token; fall back to the nearest
                        # token start before the match.
                        startPos = py_builtin.max(startPosLB)
                    # Map match.end() back to the original document endOffset.
                    endPos = -1
                    endPosLB = []
                    for end in oldProps[prop + "ToksEpos"].split(" "):
                        endToks = end.split("|")
                        if int(endToks[0]) == match.end():
                            endPos = int(endToks[1])
                        if int(endToks[0]) > match.end():
                            endPosLB.append(int(endToks[1]))
                    if endPos == -1:
                        # Match ended mid-token; fall back to the nearest
                        # token end after the match.
                        endPos = py_builtin.min(endPosLB)
                    props[prop + "Match"] = span[match.start():match.end()]
                    # Carry over excludes from the span, but only those fully
                    # contained within the matched offsets.
                    for key in oldProps.keys():
                        if key == "excludes":
                            excludesLB = []
                            for exclude in oldProps[key].split("|"):
                                arr = exclude.split(",")
                                excludeStart = int(arr[3])
                                excludeEnd = int(arr[4])
                                if excludeStart >= startPos and excludeEnd <= endPos:
                                    excludesLB.append(exclude)
                            if len(excludesLB):
                                props["excludes"] = "|".join(excludesLB)

                    annot = Row(docId = rec.docId,
                                annotSet = newAnnotSet,
                                annotType = newAnnotType,
                                startOffset = startPos,
                                endOffset = endPos,
                                annotId = annotId,
                                properties = props)

                    results.append(annot)

        return iter(results)

    results = tokensSpan.rdd.mapPartitions(lambda partition: process(partition, prop, regex, annotSet, annotType, annotProps))
    return spark.createDataFrame(results, AQSchema())

README.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ We realize that you can't have a Dataframe[AQAnnotation] like you can with scala
3838

3939
#### Utilities
4040

41-
The GetAQAnnotation and GetCATAnnotation and utility classes have been developed to create an AQAnnotation from the archive format (CATAnnotation) and vise versa. When creating the AQAnnotation, the ampersand separated string of name-value pairs in the CATAnnotation other field is mapped to a Map in the AQAnnotation record. To minimize memory consumption and increase performance, you can specify which name-value pairs to include in the Map. For more details on the implementation, view the corresponding class for each function in the AQPython Utilities module. For usage examples, view the GetAQAnnotation and GetCATAnnotation classes in the test_utilities module.
41+
The GetAQAnnotation and GetCATAnnotation utility classes have been developed to create an AQAnnotation from the archive format (CATAnnotation) and vice versa. When creating the AQAnnotation, the ampersand-separated string of name-value pairs in the CATAnnotation other field is mapped to a Map in the AQAnnotation record. To minimize memory consumption and increase performance, you can specify which name-value pairs to include in the Map as well as which ones to decode or lower-case. If you want all name-value pairs to be included in the Map, simply specify a value of ["*"] for the parameter in the function. For more details on the implementation, view the corresponding class for each function in the AQPython Utilities module. For usage examples, view the GetAQAnnotation and GetCATAnnotation classes in the test_utilities module.
4242

4343

4444
#### AnnotationQuery Functions
@@ -57,6 +57,8 @@ The following functions are currently provided by AnnotationQuery. Since functio
5757

5858
**ContainedIn** - Provide the ability to find annotations that are contained by another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are contained in B. What that means is the start/end offset for an annotation from A must be contained by the start/end offset from an annotation in B. We of course have to also match on the document id. We ultimately return the contained annotations (A) that meet this criteria. There is also the option of negating the query (think Not Contains) so that we return only A where it is not contained in B.
5959

60+
**ContainedInList** - Provide the ability to find annotations that are contained by another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are contained in B. What that means is the start/end offset for an annotation from A must be contained by the start/end offset from an annotation in B. We of course have to also match on the document id. We ultimately return a Dataframe with 2 fields where the first field is an annotation from B and the second field is an array of entries from A that are contained in the first entry.
61+
6062
**Before** - Provide the ability to find annotations that are before another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are before B. What that means is the end offset for an annotation from A must be before the start offset from an annotation in B. We of course have to also match on the document id. We ultimately return the A annotations that meet this criteria. A distance operator can also be optionally specified. This would require an A annotation (endOffset) to occur n characters (or less) before the B annotation (startOffset). There is also the option of negating the query (think Not Before) so that we return only A where it is not before B.
6163

6264
**After** - Provide the ability to find annotations that are after another annotation. The input is 2 Dataframes of AQAnnotations. We will call them A and B. The purpose is to find those annotations in A that are after B. What that means is the start offset for an annotation from A must be after the end offset from an annotation in B. We of course have to also match on the document id. We ultimately return the A annotations that meet this criteria. A distance operator can also be optionally specified. This would require an A annotation (startOffset) to occur n characters (or less) after the B annotation (endOffset). There is also the option of negating the query (think Not After) so that we return only A where it is not after B.
@@ -75,6 +77,9 @@ The following functions are currently provided by AnnotationQuery. Since functio
7577

7678
**Following** - Return the following sibling annotations for every annotation in the anchor Dataframe[AQAnnotations]. The following sibling annotations can optionally be required to be contained in a container Dataframe[AQAnnotations]. The return type of this function is different from other functions. Instead of returning a Dataframe[AQAnnotation] this function returns a Dataframe[(AQAnnotation,Array[AQAnnotation])].
7779

80+
**TokensSpan** - Provides the ability to create a string from a list of tokens that are contained in a span. The specified tokenProperty is used to extract the values from the tokens when creating the string. For SCNLP, this tokenProperty could be values like 'orig', 'lemma', or 'pos'. The spans would typically be a SCNLP 'sentence' or could even be things like an OM 'ce:para'. Returns a Dataframe[AQAnnotation] of spans with 3 new properties, all prefixed with the specified tokenProperty value followed by (ToksStr, ToksSpos, ToksEpos). The ToksStr property will be the concatenated string of token property values contained in the span. The ToksSpos and ToksEpos properties will help us determine the start/end offset for each of the individual tokens in the ToksStr. These helper properties are needed by the function RegexTokensSpan so we can generate accurate start/end offsets based on the str file.
81+
82+
**RegexTokensSpan** - Provides the ability to apply a regular expression to the concatenated string generated by TokensSpan. For the strings matching the regex, a Dataframe[AQAnnotations] will be returned. The AQAnnotation will correspond to the offsets within the concatenated string containing the match.
7883

7984
#### Concordancers
8085

-15.6 KB
Binary file not shown.
Binary file not shown.
-32.5 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pyspark==3.0.0
2+
regex
23
psutil
34
nose

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
setup(
1515
name='AnnotationQueryPython-spark3',
16-
version='1.0.5',
16+
version='1.0.6',
1717
description='Python implementation for AnnotationQuery',
1818
long_description=readme,
1919
author='Darin McBeath',

tests/test_query.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,6 @@ def test_ContainedInList1(self):
121121
properties=sortedResults[0][1][0].properties,
122122
startOffset=sortedResults[0][1][0].startOffset))
123123

124-
125-
126124
# Test Before
127125
def test_Before1(self):
128126
self.assertEquals(47,Before(FilterProperty(self.annots, "orig", "polynomial"), FilterProperty(self.annots, "orig", "function")).count())
@@ -307,5 +305,32 @@ def test_Following2(self):
307305
properties=sortedResults[0][1][2].properties,
308306
startOffset=sortedResults[0][1][2].startOffset))
309307

308+
def test_TokensSpan(self):
309+
result = TokensSpan(FilterType(self.annots,"word"),FilterType(self.annots,"sentence"),"orig").collect()
310+
sortedResults = sorted(result, key=lambda rec: (rec.docId,rec.startOffset))
311+
self.assertEquals(128,len(sortedResults))
312+
self.assertEquals(Row(annotId=1, annotSet='ge', annotType='sentence', docId='S0022314X13001777', endOffset=18607, properties={'origToksEpos' : '5|18551 14|18560 17|18563 28|18574 41|18587 44|18590 48|18594 57|18603 61|18607', 'origToksSpos' : '0|18546 6|18552 15|18561 18|18564 29|18575 42|18588 45|18591 49|18595 58|18604', 'origToksStr': 'Sylow p-groups of polynomial permutations on the integers mod'},startOffset=18546),
313+
Row(annotId=sortedResults[0].annotId,
314+
annotSet=sortedResults[0].annotSet,
315+
annotType=sortedResults[0].annotType,
316+
docId=sortedResults[0].docId,
317+
endOffset=sortedResults[0].endOffset,
318+
properties=sortedResults[0].properties,
319+
startOffset=sortedResults[0].startOffset))
320+
321+
def test_RegexTokensSpan(self):
322+
tokensSpan = TokensSpan(FilterType(self.annots,"word"),FilterType(self.annots,"sentence"),"orig")
323+
result = RegexTokensSpan(tokensSpan,"orig",r"(?i)(?<= |^)poly[a-z]+ perm[a-z]+(?= |$$)","newSet","newType").collect()
324+
sortedResults = sorted(result, key=lambda rec: (rec.docId,rec.startOffset))
325+
self.assertEquals(18,len(sortedResults))
326+
self.assertEquals(Row(annotId=1, annotSet='newSet', annotType='newType', docId='S0022314X13001777', endOffset=18587, properties={'origMatch': 'polynomial permutations'}, startOffset=18564),
327+
Row(annotId=sortedResults[0].annotId,
328+
annotSet=sortedResults[0].annotSet,
329+
annotType=sortedResults[0].annotType,
330+
docId=sortedResults[0].docId,
331+
endOffset=sortedResults[0].endOffset,
332+
properties=sortedResults[0].properties,
333+
startOffset=sortedResults[0].startOffset))
334+
310335
if __name__ == "__main__":
311336
unittest.main()

0 commit comments

Comments
 (0)