 """

 import os

-import unittest
+from pytest import fixture
 from corenlp_protobuf import Document, Sentence, Token, DependencyGraph, CorefChain
 from corenlp_protobuf import parseFromDelimitedString, to_text

-class TestProtobuf(unittest.TestCase):
-    def setUp(self):
-        self.text = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"
-        test_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
-        with open(test_data, 'rb') as f:
-            self.buf = f.read()
-
-        self.doc = Document()
-        parseFromDelimitedString(self.doc, self.buf)
-
-    def test_parse_protobuf(self):
-        self.assertEqual(4239, self.doc.ByteSize(), "Could not read input correctly")
-
-    def test_document_text(self):
-        self.assertEqual(self.text, self.doc.text)
-
-    def test_sentences(self):
-        self.assertEqual(1, len(self.doc.sentence))
-
-        sentence = self.doc.sentence[0]
-        self.assertTrue(isinstance(sentence, Sentence))
-        self.assertEqual(67, sentence.characterOffsetEnd - sentence.characterOffsetBegin)  # Sentence length
-        self.assertEqual('', sentence.text)  # Note that the sentence text should actually be recovered from the tokens.
-        self.assertEqual(self.text[:-1], to_text(sentence))  # Note that the sentence text should actually be recovered from the tokens.
-
-    def test_tokens(self):
-        sentence = self.doc.sentence[0]
-        tokens = sentence.token
-        self.assertEqual(12, len(tokens))
-        self.assertTrue(isinstance(tokens[0], Token))
-
-        # Word
-        words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split()
-        words_ = [t.word for t in tokens]
-        self.assertEqual(words, words_)
-
-        # Lemma
-        lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split()
-        lemmas_ = [t.lemma for t in tokens]
-        self.assertEqual(lemmas, lemmas_)
-
-        # POS
-        pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split()
-        pos_ = [t.pos for t in tokens]
-        self.assertEqual(pos, pos_)
-
-        # NER
-        ner = "PERSON O O O O O O O O ORGANIZATION O O".split()
-        ner_ = [t.ner for t in tokens]
-        self.assertEqual(ner, ner_)
-
-        # character offsets
-        begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()]
-        end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()]
-        begin_ = [t.beginChar for t in tokens]
-        end_ = [t.endChar for t in tokens]
-        self.assertEqual(begin, begin_)
-        self.assertEqual(end, end_)
-
-    def test_dependency_parse(self):
-        """
-        Extract the dependency parse from the annotation.
-        """
-        sentence = self.doc.sentence[0]
-
-        # You can choose from the following types of dependencies.
-        # In general, you'll want enhancedPlusPlus
-        self.assertTrue(sentence.basicDependencies.ByteSize() > 0)
-        self.assertTrue(sentence.enhancedDependencies.ByteSize() > 0)
-        self.assertTrue(sentence.enhancedPlusPlusDependencies.ByteSize() > 0)
-
-        tree = sentence.enhancedPlusPlusDependencies
-        self.assertTrue(isinstance(tree, DependencyGraph))
-        # Indices are 1-indexed, with 0 being the "pseudo root"
-        self.assertEqual([2], tree.root)  # 'wrote' is the root.
-        # There are as many nodes as there are tokens.
-        self.assertEqual(len(sentence.token), len(tree.node))
-
-        # Enhanced++ dependencies often contain additional edges and are
-        # not trees -- here, 'parsed' would also have an edge to
-        # 'sentence'
-        self.assertEqual(12, len(tree.edge))
-
-        # This edge goes from "wrote" to "Chris"
-        edge = tree.edge[0]
-        self.assertEqual(2, edge.source)
-        self.assertEqual(1, edge.target)
-        self.assertEqual("nsubj", edge.dep)
-
-    def test_coref_chain(self):
-        """
-        Extract the coreference chains from the annotation.
-        """
-        # Coreference chains span sentences and are stored in the
-        # document.
-        chains = self.doc.corefChain
-
-        # In this document there is 1 chain with Chris and he.
-        self.assertEqual(1, len(chains))
-        chain = chains[0]
-        self.assertTrue(isinstance(chain, CorefChain))
-        self.assertEqual(0, chain.mention[0].beginIndex)  # Starts at token 0 == 'Chris'
-        self.assertEqual(1, chain.mention[0].endIndex)
-        self.assertEqual("MALE", chain.mention[0].gender)
-
-        self.assertEqual(6, chain.mention[1].beginIndex)  # Starts at token 6 == 'he'
-        self.assertEqual(7, chain.mention[1].endIndex)
-        self.assertEqual("MALE", chain.mention[1].gender)
-
-        self.assertEqual(0, chain.representative)  # The head of the chain is 'Chris'
-
-
-if __name__ == "__main__":
-    unittest.main()
+
+# The text that was annotated
+TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"
+
+@fixture
+def doc_pb():
+    test_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
+    with open(test_data, 'rb') as f:
+        buf = f.read()
+    doc = Document()
+    parseFromDelimitedString(doc, buf)
+    return doc
+
+def test_parse_protobuf(doc_pb):
+    assert doc_pb.ByteSize() == 4239
+
+def test_document_text(doc_pb):
+    assert doc_pb.text == TEXT
+
+def test_sentences(doc_pb):
+    assert len(doc_pb.sentence) == 1
+
+    sentence = doc_pb.sentence[0]
+    assert isinstance(sentence, Sentence)
+    assert sentence.characterOffsetEnd - sentence.characterOffsetBegin == 67  # Sentence length
+    assert sentence.text == ''  # Note that the sentence text should actually be recovered from the tokens.
+    assert to_text(sentence) == TEXT[:-1]  # Note that the sentence text should actually be recovered from the tokens.
+
+def test_tokens(doc_pb):
+    sentence = doc_pb.sentence[0]
+    tokens = sentence.token
+    assert len(tokens) == 12
+    assert isinstance(tokens[0], Token)
+
+    # Word
+    words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split()
+    words_ = [t.word for t in tokens]
+    assert words_ == words
+
+    # Lemma
+    lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split()
+    lemmas_ = [t.lemma for t in tokens]
+    assert lemmas_ == lemmas
+
+    # POS
+    pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split()
+    pos_ = [t.pos for t in tokens]
+    assert pos_ == pos
+
+    # NER
+    ner = "PERSON O O O O O O O O ORGANIZATION O O".split()
+    ner_ = [t.ner for t in tokens]
+    assert ner_ == ner
+
+    # character offsets
+    begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()]
+    end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()]
+    begin_ = [t.beginChar for t in tokens]
+    end_ = [t.endChar for t in tokens]
+    assert begin_ == begin
+    assert end_ == end
+
+def test_dependency_parse(doc_pb):
+    """
+    Extract the dependency parse from the annotation.
+    """
+    sentence = doc_pb.sentence[0]
+
+    # You can choose from the following types of dependencies.
+    # In general, you'll want enhancedPlusPlus
+    assert sentence.basicDependencies.ByteSize() > 0
+    assert sentence.enhancedDependencies.ByteSize() > 0
+    assert sentence.enhancedPlusPlusDependencies.ByteSize() > 0
+
+    tree = sentence.enhancedPlusPlusDependencies
+    assert isinstance(tree, DependencyGraph)
+    # Indices are 1-indexed, with 0 being the "pseudo root"
+    assert tree.root == [2]  # 'wrote' is the root.
+    # There are as many nodes as there are tokens.
+    assert len(tree.node) == len(sentence.token)
+
+    # Enhanced++ dependencies often contain additional edges and are
+    # not trees -- here, 'parsed' would also have an edge to
+    # 'sentence'
+    assert len(tree.edge) == 12
+
+    # This edge goes from "wrote" to "Chris"
+    edge = tree.edge[0]
+    assert edge.source == 2
+    assert edge.target == 1
+    assert edge.dep == "nsubj"
+
+def test_coref_chain(doc_pb):
+    """
+    Extract the coreference chains from the annotation.
+    """
+    # Coreference chains span sentences and are stored in the
+    # document.
+    chains = doc_pb.corefChain
+
+    # In this document there is 1 chain with Chris and he.
+    assert len(chains) == 1
+    chain = chains[0]
+    assert isinstance(chain, CorefChain)
+    assert chain.mention[0].beginIndex == 0  # 'Chris'
+    assert chain.mention[0].endIndex == 1
+    assert chain.mention[0].gender == "MALE"
+
+    assert chain.mention[1].beginIndex == 6  # 'he'
+    assert chain.mention[1].endIndex == 7
+    assert chain.mention[1].gender == "MALE"
+
+    assert chain.representative == 0  # The head of the chain is 'Chris'
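For readers following the change: pytest matches each test's `doc_pb` argument to the `@fixture` function of the same name and calls it once per test, so the shared `setUp` state of the unittest version becomes an injected per-test value. A minimal self-contained sketch of the same pattern (the names here are illustrative, not from the repository):

    from pytest import fixture

    @fixture
    def doc():
        # Stand-in for the expensive setup (reading and parsing data/test.dat).
        return {"text": "Chris wrote a simple sentence."}

    def test_text(doc):
        # pytest injects the fixture's return value by argument name.
        assert doc["text"].startswith("Chris")

Since these tests only read the parsed document, declaring the fixture with `@fixture(scope='module')` would parse test.dat once for the whole module instead of once per test; the default scope is 'function'.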
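As a usage sketch outside the test suite, the same few calls deserialize any length-delimited, protobuf-serialized CoreNLP document. The input filename below is a placeholder, and the file is assumed to have been produced by CoreNLP's ProtobufAnnotationSerializer (e.g. via `-outputFormat serialized`):

    from corenlp_protobuf import Document, parseFromDelimitedString, to_text

    # 'example.dat' is a hypothetical file serialized by CoreNLP.
    with open('example.dat', 'rb') as f:
        buf = f.read()

    doc = Document()
    parseFromDelimitedString(doc, buf)  # fills `doc` from the delimited byte stream

    for sentence in doc.sentence:
        print(to_text(sentence))  # sentence text is recovered from the tokens
        for token in sentence.token:
            print(token.word, token.pos, token.ner)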