9
9
10
10
import os
11
11
from pytest import fixture
12
- from corenlp_protobuf import Document , Sentence , Token , DependencyGraph , CorefChain
12
+ from corenlp_protobuf import Document , Sentence , Token , DependencyGraph ,\
13
+ CorefChain
13
14
from corenlp_protobuf import parseFromDelimitedString , to_text
14
15
15
16
16
17
# The text that was annotated
17
18
TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n "
18
19
20
+
19
21
@fixture
def doc_pb():
    """Load the serialized test annotation and parse it into a Document.

    Reads ``data/test.dat`` (located next to this test file) as raw bytes
    and deserializes it with ``parseFromDelimitedString``.

    Returns:
        Document: the parsed CoreNLP Document protobuf used by the tests.
    """
    # Resolve the data file relative to this test module, not the CWD.
    test_dir = os.path.dirname(os.path.abspath(__file__))
    test_data = os.path.join(test_dir, 'data', 'test.dat')
    with open(test_data, 'rb') as f:
        buf = f.read()
    doc = Document()
    parseFromDelimitedString(doc, buf)
    return doc
27
30
31
+
28
32
def test_parse_protobuf(doc_pb):
    """The deserialized Document should have the expected protobuf size."""
    assert doc_pb.ByteSize() == 4239
30
34
35
+
31
36
def test_document_text(doc_pb):
    """Document.text should round-trip to the original annotated text."""
    assert doc_pb.text == TEXT
33
38
39
+
34
40
def test_sentences(doc_pb):
    """Check the document's single sentence: type, span length, and text.

    The annotated document contains exactly one sentence. Its character
    offsets should span 67 characters, and its text — empty on the
    Sentence message itself — must be recovered from the tokens.
    """
    assert len(doc_pb.sentence) == 1

    sentence = doc_pb.sentence[0]
    assert isinstance(sentence, Sentence)
    # Check sentence length via the character offsets (was previously an
    # always-truthy bare expression; pin the exact length instead).
    assert sentence.characterOffsetEnd - sentence.characterOffsetBegin == 67
    # Note that the sentence text should actually be recovered from the tokens.
    assert sentence.text == ''
    # TEXT ends with a trailing character the sentence does not include.
    assert to_text(sentence) == TEXT[:-1]
50
+
42
51
43
52
def test_tokens (doc_pb ):
44
53
sentence = doc_pb .sentence [0 ]
@@ -54,25 +63,26 @@ def test_tokens(doc_pb):
54
63
# Lemma
55
64
lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP ." .split ()
56
65
lemmas_ = [t .lemma for t in tokens ]
57
- assert lemmas_ == lemmas
66
+ assert lemmas_ == lemmas
58
67
59
68
# POS
60
69
pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP ." .split ()
61
70
pos_ = [t .pos for t in tokens ]
62
- assert pos_ == pos
71
+ assert pos_ == pos
63
72
64
73
# NER
65
74
ner = "PERSON O O O O O O O O ORGANIZATION O O" .split ()
66
75
ner_ = [t .ner for t in tokens ]
67
- assert ner_ == ner
76
+ assert ner_ == ner
68
77
69
78
# character offsets
70
79
begin = [int (i ) for i in "0 6 12 14 21 30 35 38 45 50 59 66" .split ()]
71
80
end = [int (i ) for i in "5 11 13 20 29 34 37 44 49 58 66 67" .split ()]
72
81
begin_ = [t .beginChar for t in tokens ]
73
82
end_ = [t .endChar for t in tokens ]
74
- assert begin_ == begin
75
- assert end_ == end
83
+ assert begin_ == begin
84
+ assert end_ == end
85
+
76
86
77
87
def test_dependency_parse (doc_pb ):
78
88
"""
@@ -89,7 +99,7 @@ def test_dependency_parse(doc_pb):
89
99
tree = sentence .enhancedPlusPlusDependencies
90
100
isinstance (tree , DependencyGraph )
91
101
# Indices are 1-indexd with 0 being the "pseudo root"
92
- assert tree .root # 'wrote' is the root. == [2]
102
+ assert tree .root # 'wrote' is the root. == [2]
93
103
# There are as many nodes as there are tokens.
94
104
assert len (tree .node ) == len (sentence .token )
95
105
@@ -104,6 +114,7 @@ def test_dependency_parse(doc_pb):
104
114
assert edge .target == 1
105
115
assert edge .dep == "nsubj"
106
116
117
+
107
118
def test_coref_chain (doc_pb ):
108
119
"""
109
120
Extract the corefence chains from the annotation.
@@ -113,15 +124,15 @@ def test_coref_chain(doc_pb):
113
124
chains = doc_pb .corefChain
114
125
115
126
# In this document there is 1 chain with Chris and he.
116
- assert len (chains ) == 1
127
+ assert len (chains ) == 1
117
128
chain = chains [0 ]
118
129
assert isinstance (chain , CorefChain )
119
- assert chain .mention [0 ].beginIndex == 0 # 'Chris'
130
+ assert chain .mention [0 ].beginIndex == 0 # 'Chris'
120
131
assert chain .mention [0 ].endIndex == 1
121
132
assert chain .mention [0 ].gender == "MALE"
122
133
123
- assert chain .mention [1 ].beginIndex == 6 # 'he'
134
+ assert chain .mention [1 ].beginIndex == 6 # 'he'
124
135
assert chain .mention [1 ].endIndex == 7
125
136
assert chain .mention [1 ].gender == "MALE"
126
137
127
- assert chain .representative == 0 # The head of the chain is 'Chris'
138
+ assert chain .representative == 0 # Head of the chain is 'Chris'
0 commit comments