Skip to content

Commit 85c7f12

Browse files
committed
Added new function ContainedInList and enhanced Concordancer
Now have the ability to highlight in a variety of colors with Concordancer.
1 parent 2223d22 commit 85c7f12

File tree

4 files changed

+93
-4
lines changed

4 files changed

+93
-4
lines changed

AQPython/Concordancers.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def _buildHTML(text):
2929
return "<html><body><table border='1' style='font-family: monospace;table-layout: fixed;'>" + text + "</table></body></html>"
3030

3131

32-
def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None):
32+
def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None, colorPropertyKey="", colorMap={}):
3333
"""Output HTML for the text identified by the AQAnnotation and highlight in 'red' the text that was ignored (excluded).
3434
3535
Args:
@@ -38,7 +38,8 @@ def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None
3838
nrows: Number of results to display
3939
offset: Number of characters before/after each annotation in results to display
4040
highlightAnnotations: Dataframe of AQAnnotations that you would like to highlight in the results
41-
41+
colorPropertyKey: Key in the property map of highlightAnnotations to get the value for the color lookup in the specified colorMap
42+
colorMap: Map the colorPropertyKey value to the specified color in the Map. Default is blue when not found.
4243
Returns:
4344
HTML
4445
@@ -74,7 +75,10 @@ def Concordancer(results, textMnt, nrows=10, offset=0, highlightAnnotations=None
7475
if highlightTokens != None:
7576
hlTokens = [ hlToken for hlToken in highlightTokens if hlToken.docId == annot.docId and hlToken.startOffset >= annot.startOffset and hlToken.endOffset <= annot.endOffset ]
7677
for hlToken in hlTokens:
77-
hlToks.append((hlToken.startOffset,hlToken.startOffset,"hl", "<font color='blue'>"))
78+
hlColor = 'blue'
79+
if (hlToken.properties != None) and (colorPropertyKey in hlToken.properties):
80+
hlColor = colorMap.get(hlToken.properties[colorPropertyKey], 'blue')
81+
hlToks.append((hlToken.startOffset,hlToken.startOffset,"hl", "<font color='" + hlColor + "'>"))
7882
hlToks.append((hlToken.endOffset,hlToken.endOffset,"hl", "</font>"))
7983

8084
# Process the excludeTokens

AQPython/Query.py

+60
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,66 @@ def ContainedIn(left, right, limit=0, negate=False):
280280
return results
281281

282282

283+
def ContainedInList(left, right):
    """Provide the ability to find annotations that are contained by another annotation.

    The input is 2 Dataframes of AQAnnotations. We will call them A (left) and B (right).
    The purpose is to find those annotations in A that are contained in B. What that means is
    the start/end offset for an annotation from A must be contained by the start/end offset
    from an annotation in B, and both must have the same document id. A pair is ignored when
    A and B share the same annotSet, annotType, startOffset and endOffset, so an annotation
    is never reported as being contained in itself.

    We ultimately return a Dataframe with 2 fields where the first field is an annotation
    from B and the second field is an array of the entries from A that are contained in it,
    ordered by descending endOffset. The join is an inner join, so annotations from B that
    contain nothing from A do not appear in the result.

    Args:
        left: Dataframe of AQAnnotations, the ones we will return (as a list) if they are contained in AQAnnotations from 'right'.
        right: Dataframe of AQAnnotations, the ones we are looking to see if they contain AQAnnotations from 'left'.

    Returns:
        Dataframe of (AQAnnotations,Array[AQAnnotations])
    """

    def containedAQ(group):
        # 'group' is the (key, rows) pair produced by groupBy; only the rows are needed
        # because every row carries the full 'right' annotation.
        # Sort the contained annotations by descending end offset (missing offsets last).
        srecs = sorted(group[1],
                       key=lambda x: -1 if x.LendOffset is None else x.LendOffset,
                       reverse=True)

        # All rows in a group share the same 'right' annotation, so take it from the first one.
        first = srecs[0]
        key = Row(docId=first.RdocId,
                  annotSet=first.RannotSet,
                  annotType=first.RannotType,
                  startOffset=int(first.RstartOffset),
                  endOffset=int(first.RendOffset),
                  annotId=int(first.RannotId),
                  properties=first.Rproperties)

        # Construct the array of contained ('left') annotations.
        values = [Row(docId=srec.LdocId,
                      annotSet=srec.LannotSet,
                      annotType=srec.LannotType,
                      startOffset=int(srec.LstartOffset),
                      endOffset=int(srec.LendOffset),
                      annotId=int(srec.LannotId),
                      properties=srec.Lproperties)
                  for srec in srecs
                  if srec.LdocId is not None]
        return (key, values)

    # Prefix the columns so both sides can live in the same joined row without name clashes.
    l = left.select("annotId", "annotSet", "annotType", "docId", "endOffset", "properties", "startOffset") \
            .toDF("LannotId", "LannotSet", "LannotType", "LdocId", "LendOffset", "Lproperties", "LstartOffset")
    r = right.select("annotId", "annotSet", "annotType", "docId", "endOffset", "properties", "startOffset") \
             .toDF("RannotId", "RannotSet", "RannotType", "RdocId", "RendOffset", "Rproperties", "RstartOffset")

    # Same document, 'left' span inside 'right' span, and not the very same annotation span.
    containment = ((col("LdocId") == col("RdocId")) &
                   (col("LstartOffset") >= col("RstartOffset")) &
                   (col("LendOffset") <= col("RendOffset")) &
                   (~((col("LannotSet") == col("RannotSet")) &
                      (col("LannotType") == col("RannotType")) &
                      (col("LstartOffset") == col("RstartOffset")) &
                      (col("LendOffset") == col("RendOffset")))))

    # Group on the full identity of the 'right' annotation. Grouping on document and offsets
    # alone would merge distinct 'right' annotations that happen to share the same span.
    results = l.join(r, containment) \
               .rdd \
               .groupBy(lambda x: (x["RdocId"], x["RannotSet"], x["RannotType"],
                                   x["RannotId"], x["RstartOffset"], x["RendOffset"])) \
               .map(containedAQ)

    return spark.createDataFrame(results, AQSchemaList())
341+
342+
283343
def Before(left, right, dist=sys.maxsize , limit=0, negate=False):
284344
"""Provide the ability to find annotations that are before another annotation.
285345

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
setup(
1515
name='AnnotationQueryPython-spark3',
16-
version='1.0.4',
16+
version='1.0.5',
1717
description='Python implementation for AnnotationQuery',
1818
long_description=readme,
1919
author='Darin McBeath',

tests/test_query.py

+25
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,31 @@ def test_ContainedIn1(self):
9898
def test_ContainedIn2(self):
    """ContainedIn with negate=True on sentences vs. ce:para annotations yields 2 rows for the test data."""
    sentences = FilterType(self.annots, "sentence")
    paragraphs = FilterType(self.annots, "ce:para")
    # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(2, ContainedIn(sentences, paragraphs, negate = True).count())
100100

101+
# Test ContainedInList
102+
def test_ContainedInList1(self):
    """ContainedInList returns the containing sentence together with the list of 'polynomial' words inside it."""
    result = ContainedInList(FilterProperty(self.annots, 'orig', 'polynomial'),
                             FilterType(self.annots, 'sentence')).collect()
    # collect() gives no ordering guarantee, so order by the containing annotation's span.
    sortedResults = sorted(result, key=lambda tup: (tup[0]["startOffset"], tup[0]["endOffset"]))
    container = sortedResults[0][0]
    contained = sortedResults[0][1]

    # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(1, len(contained))

    # Rebuild each Row with an explicit field order so the comparison does not depend on
    # the field order of the rows returned by Spark.
    self.assertEqual(Row(annotId=1, annotSet='ge', annotType='sentence', docId='S0022314X13001777', endOffset=18607, properties={}, startOffset=18546),
                     Row(annotId=container.annotId,
                         annotSet=container.annotSet,
                         annotType=container.annotType,
                         docId=container.docId,
                         endOffset=container.endOffset,
                         properties=container.properties,
                         startOffset=container.startOffset))
    word = contained[0]
    self.assertEqual(Row(annotId=7, annotSet='ge', annotType='word', docId='S0022314X13001777', endOffset=18574, properties={'lemma': 'polynomial', 'orig': 'polynomial', 'pos': 'jj'}, startOffset=18564),
                     Row(annotId=word.annotId,
                         annotSet=word.annotSet,
                         annotType=word.annotType,
                         docId=word.docId,
                         endOffset=word.endOffset,
                         properties=word.properties,
                         startOffset=word.startOffset))
123+
124+
125+
101126
# Test Before
102127
def test_Before1(self):
103128
self.assertEquals(47,Before(FilterProperty(self.annots, "orig", "polynomial"), FilterProperty(self.annots, "orig", "function")).count())

0 commit comments

Comments
 (0)