@@ -129,11 +129,11 @@ class CoreNLPClient(RobustService):
129
129
DEFAULT_PROPERTIES = {}
130
130
DEFAULT_OUTPUT_FORMAT = "serialized"
131
131
132
- def __init__ (self , start_server = True ,
133
- endpoint = "http://localhost:9000" ,
134
- timeout = 5000 ,
132
+ def __init__ (self , start_server = True ,
133
+ endpoint = "http://localhost:9000" ,
134
+ timeout = 15000 ,
135
135
threads = 5 ,
136
- annotators = None ,
136
+ annotators = None ,
137
137
properties = None ,
138
138
output_format = None ,
139
139
stdout = sys .stdout ,
@@ -142,6 +142,8 @@ def __init__(self, start_server=True,
142
142
be_quiet = True ,
143
143
max_char_length = 100000
144
144
):
145
+ if isinstance (annotators , str ):
146
+ annotators = annotators .split ()
145
147
146
148
if start_server :
147
149
host , port = urlparse (endpoint ).netloc .split (":" )
@@ -161,6 +163,7 @@ def __init__(self, start_server=True,
161
163
162
164
super (CoreNLPClient , self ).__init__ (start_cmd , stop_cmd , endpoint ,
163
165
stdout , stderr , be_quiet )
166
+ self .timeout = timeout
164
167
self .default_annotators = annotators or self .DEFAULT_ANNOTATORS
165
168
self .default_properties = properties or self .DEFAULT_PROPERTIES
166
169
self .default_output_format = output_format or self .DEFAULT_OUTPUT_FORMAT
@@ -185,7 +188,8 @@ def _request(self, buf, properties):
185
188
186
189
r = requests .post (self .endpoint ,
187
190
params = {'properties' : str (properties )},
188
- data = buf , headers = {'content-type' : ctype })
191
+ data = buf , headers = {'content-type' : ctype },
192
+ timeout = (self .timeout * 2 )/ 1000 )
189
193
r .raise_for_status ()
190
194
return r
191
195
except requests .HTTPError as e :
@@ -250,43 +254,46 @@ def update(self, doc, annotators=None, properties=None):
250
254
return doc
251
255
252
256
def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
    """Apply a TokensRegex pattern to text through the '/tokensregex' endpoint.

    :param text: raw text for the server to apply the pattern to
    :param (str | unicode) pattern: TokensRegex pattern
    :param (bool) filter: forwarded unchanged to the regex request
    :param (bool) to_words: if true, pass the result through
        regex_matches_to_indexed_words before returning it
    :param annotators: forwarded unchanged to the regex request
    :param properties: forwarded unchanged to the regex request
    :return: the request result, optionally converted to indexed words
    """
    result = self.__regex('/tokensregex', text, pattern, filter,
                          annotators=annotators, properties=properties)
    if not to_words:
        return result
    return regex_matches_to_indexed_words(result)
266
262
267
def semgrex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
    """Apply a Semgrex pattern to text through the '/semgrex' endpoint.

    :param text: raw text for the server to apply the pattern to
    :param (str | unicode) pattern: Semgrex pattern
    :param (bool) filter: forwarded unchanged to the regex request
    :param (bool) to_words: if true, pass the result through
        regex_matches_to_indexed_words before returning it
    :param annotators: forwarded unchanged to the regex request
    :param properties: forwarded unchanged to the regex request
    :return: the request result, optionally converted to indexed words
    """
    result = self.__regex('/semgrex', text, pattern, filter,
                          annotators=annotators, properties=properties)
    if not to_words:
        return result
    return regex_matches_to_indexed_words(result)
272
268
273
def tregrex(self, text, pattern, filter=False, annotators=None, properties=None):
    """Apply a Tregex pattern to text through the '/tregex' endpoint.

    :param text: raw text for the server to apply the pattern to
    :param (str | unicode) pattern: Tregex pattern
    :param (bool) filter: forwarded unchanged to the regex request
    :param annotators: forwarded unchanged to the regex request
    :param properties: forwarded unchanged to the regex request
    :return: the request result exactly as returned by the regex request
    """
    endpoint_path = '/tregex'
    return self.__regex(endpoint_path, text, pattern, filter,
                        annotators=annotators, properties=properties)
275
271
276
- def __regex (self , path , text , pattern , filter , properties ):
272
+ def __regex (self , path , text , pattern , filter , annotators = None , properties = None ):
277
273
"""Send a regex-related request to the CoreNLP server.
278
274
:param (str | unicode) path: the path for the regex endpoint
279
275
:param text: raw text for the CoreNLPServer to apply the regex
280
276
:param (str | unicode) pattern: regex pattern
281
277
:param (bool) filter: option to filter sentences that contain matches, if false returns matches
278
+ :param properties: properties sent with the request; if None, the client's default properties are used and annotators, inputFormat, outputFormat and serializer are set on them
282
279
:return: request result
283
280
"""
284
281
self .ensure_alive ()
282
+ if properties is None :
283
+ properties = self .default_properties
284
+ properties .update ({
285
+ 'annotators' : ',' .join (annotators or self .default_annotators ),
286
+ 'inputFormat' : 'text' ,
287
+ 'outputFormat' : self .default_output_format ,
288
+ 'serializer' : 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
289
+ })
290
+ elif "annotators" not in properties :
291
+ properties .update ({'annotators' : ',' .join (annotators or self .default_annotators )})
285
292
286
293
# HACK: For some stupid reason, CoreNLPServer will timeout if we
287
294
# need to annotate something from scratch. So, we need to call
288
295
# this to ensure that the _regex call doesn't timeout.
289
- # self.annotate(text)
296
+ self .annotate (text , properties = properties )
290
297
291
298
try :
292
299
# Error occurs unless put properties in params
@@ -304,7 +311,9 @@ def __regex(self, path, text, pattern, filter, properties):
304
311
'filter' : filter ,
305
312
'properties' : str (properties )
306
313
}, data = text ,
307
- headers = {'content-type' : ctype })
314
+ headers = {'content-type' : ctype },
315
+ timeout = (self .timeout * 2 )/ 1000 ,
316
+ )
308
317
r .raise_for_status ()
309
318
return json .loads (r .text )
310
319
except requests .HTTPError as e :
0 commit comments