Skip to content

Commit f046f4f

Browse files
committed
Fixed timeout issue for tokensregex
1 parent d57026c commit f046f4f

File tree

2 files changed

+33
-24
lines changed

2 files changed

+33
-24
lines changed

corenlp/client.py

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -129,11 +129,11 @@ class CoreNLPClient(RobustService):
129129
DEFAULT_PROPERTIES = {}
130130
DEFAULT_OUTPUT_FORMAT = "serialized"
131131

132-
def __init__(self, start_server=True,
133-
endpoint="http://localhost:9000",
134-
timeout=5000,
132+
def __init__(self, start_server=True,
133+
endpoint="http://localhost:9000",
134+
timeout=15000,
135135
threads=5,
136-
annotators=None,
136+
annotators=None,
137137
properties=None,
138138
output_format=None,
139139
stdout=sys.stdout,
@@ -142,6 +142,8 @@ def __init__(self, start_server=True,
142142
be_quiet=True,
143143
max_char_length=100000
144144
):
145+
if isinstance(annotators, str):
146+
annotators = annotators.split()
145147

146148
if start_server:
147149
host, port = urlparse(endpoint).netloc.split(":")
@@ -161,6 +163,7 @@ def __init__(self, start_server=True,
161163

162164
super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint,
163165
stdout, stderr, be_quiet)
166+
self.timeout = timeout
164167
self.default_annotators = annotators or self.DEFAULT_ANNOTATORS
165168
self.default_properties = properties or self.DEFAULT_PROPERTIES
166169
self.default_output_format = output_format or self.DEFAULT_OUTPUT_FORMAT
@@ -185,7 +188,8 @@ def _request(self, buf, properties):
185188

186189
r = requests.post(self.endpoint,
187190
params={'properties': str(properties)},
188-
data=buf, headers={'content-type': ctype})
191+
data=buf, headers={'content-type': ctype},
192+
timeout=(self.timeout*2)/1000)
189193
r.raise_for_status()
190194
return r
191195
except requests.HTTPError as e:
@@ -250,43 +254,46 @@ def update(self, doc, annotators=None, properties=None):
250254
return doc
251255

252256
def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
253-
# Error occurs unless put properties in params
254-
if properties is None:
255-
properties = self.default_properties
256-
properties.update({
257-
'annotators': ','.join(annotators or self.default_annotators),
258-
'inputFormat': 'text',
259-
'outputFormat': 'serialized',
260-
'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
261-
})
262-
matches = self.__regex('/tokensregex', text, pattern, filter, properties)
257+
# this is required for some reason
258+
matches = self.__regex('/tokensregex', text, pattern, filter, annotators, properties)
263259
if to_words:
264260
matches = regex_matches_to_indexed_words(matches)
265261
return matches
266262

267-
def semgrex(self, text, pattern, filter=False, to_words=False):
268-
matches = self.__regex('/semgrex', text, pattern, filter)
263+
def semgrex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
264+
matches = self.__regex('/semgrex', text, pattern, filter, annotators, properties)
269265
if to_words:
270266
matches = regex_matches_to_indexed_words(matches)
271267
return matches
272268

273-
def tregrex(self, text, pattern, filter=False):
274-
return self.__regex('/tregex', text, pattern, filter)
269+
def tregrex(self, text, pattern, filter=False, annotators=None, properties=None):
270+
return self.__regex('/tregex', text, pattern, filter, annotators, properties)
275271

276-
def __regex(self, path, text, pattern, filter, properties):
272+
def __regex(self, path, text, pattern, filter, annotators=None, properties=None):
277273
"""Send a regex-related request to the CoreNLP server.
278274
:param (str | unicode) path: the path for the regex endpoint
279275
:param text: raw text for the CoreNLPServer to apply the regex
280276
:param (str | unicode) pattern: regex pattern
281277
:param (bool) filter: option to filter sentences that contain matches, if false returns matches
278+
:param properties: optional dict of request properties; when None, the client's default properties are used and annotator/serializer settings are filled in
282279
:return: request result
283280
"""
284281
self.ensure_alive()
282+
if properties is None:
283+
properties = self.default_properties
284+
properties.update({
285+
'annotators': ','.join(annotators or self.default_annotators),
286+
'inputFormat': 'text',
287+
'outputFormat': self.default_output_format,
288+
'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
289+
})
290+
elif "annotators" not in properties:
291+
properties.update({'annotators': ','.join(annotators or self.default_annotators)})
285292

286293
# HACK: For some stupid reason, CoreNLPServer will timeout if we
287294
# need to annotate something from scratch. So, we need to call
288295
# this to ensure that the _regex call doesn't timeout.
289-
# self.annotate(text)
296+
self.annotate(text, properties=properties)
290297

291298
try:
292299
# Error occurs unless put properties in params
@@ -304,7 +311,9 @@ def __regex(self, path, text, pattern, filter, properties):
304311
'filter': filter,
305312
'properties': str(properties)
306313
}, data=text,
307-
headers={'content-type': ctype})
314+
headers={'content-type': ctype},
315+
timeout=(self.timeout*2)/1000,
316+
)
308317
r.raise_for_status()
309318
return json.loads(r.text)
310319
except requests.HTTPError as e:

tests/test_client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_update():
2323
assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
2424

2525
def test_tokensregex():
26-
with corenlp.CoreNLPClient(annotators='tokenize ssplit ner depparse'.split()) as client:
26+
with corenlp.CoreNLPClient(annotators='tokenize ssplit ner depparse'.split(), timeout=60000) as client:
2727
# Example pattern from: https://nlp.stanford.edu/software/tokensregex.shtml
2828
pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
2929
matches = client.tokensregex(TEXT, pattern)
@@ -44,7 +44,7 @@ def test_tokensregex():
4444
},]}
4545

4646
def test_semgrex():
47-
with corenlp.CoreNLPClient(annotators='tokenize ssplit pos lemma ner depparse'.split()) as client:
47+
with corenlp.CoreNLPClient(annotators='tokenize ssplit pos lemma ner depparse'.split(), timeout=60000) as client:
4848
pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
4949
matches = client.semgrex(TEXT, pattern, to_words=True)
5050
assert matches == [

0 commit comments

Comments
 (0)