
Commit 09bbd1d

Arun Tejasvi Chaganty committed
Ported tests to simpler py.test framework
1 parent 6755b5f commit 09bbd1d
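The diff below replaces the unittest.TestCase scaffolding, whose setUp method re-parsed the serialized document before every test, with a module-level pytest fixture that is injected into each test function by parameter name. A minimal sketch of the fixture pattern, using illustrative names that are not from this repository:

from pytest import fixture

@fixture
def shared_data():
    # Built for each test that names 'shared_data' as a parameter;
    # the return value is what the test function receives.
    return {"answer": 42}  # stand-in for an expensive setup step

def test_shared_data(shared_data):
    # pytest injects the fixture by matching the parameter name.
    assert shared_data["answer"] == 42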

File tree

1 file changed: 114 additions & 115 deletions


tests/test_read.py

Lines changed: 114 additions & 115 deletions
@@ -8,121 +8,120 @@
 """

 import os
-import unittest
+from pytest import fixture

 from corenlp_protobuf import Document, Sentence, Token, DependencyGraph, CorefChain
 from corenlp_protobuf import parseFromDelimitedString, to_text

-class TestProtobuf(unittest.TestCase):
-    def setUp(self):
-        self.text = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"
-        test_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
-        with open(test_data, 'rb') as f:
-            self.buf = f.read()
-
-        self.doc = Document()
-        parseFromDelimitedString(self.doc, self.buf)
-
-    def test_parse_protobuf(self):
-        self.assertEqual(4239, self.doc.ByteSize(), "Could not read input correctly")
-
-    def test_document_text(self):
-        self.assertEqual(self.text, self.doc.text)
-
-    def test_sentences(self):
-        self.assertEqual(1, len(self.doc.sentence))
-
-        sentence = self.doc.sentence[0]
-        self.assertTrue(isinstance(sentence, Sentence))
-        self.assertEqual(67, sentence.characterOffsetEnd - sentence.characterOffsetBegin) # Sentence length
-        self.assertEqual('', sentence.text) # Note that the sentence text should actually be recovered from the tokens.
-        self.assertEqual(self.text[:-1], to_text(sentence)) # Note that the sentence text should actually be recovered from the tokens.
-
-    def test_tokens(self):
-        sentence = self.doc.sentence[0]
-        tokens = sentence.token
-        self.assertEqual(12, len(tokens))
-        self.assertTrue(isinstance(tokens[0], Token))
-
-        # Word
-        words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split()
-        words_ = [t.word for t in tokens]
-        self.assertEqual(words, words_)
-
-        # Lemma
-        lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split()
-        lemmas_ = [t.lemma for t in tokens]
-        self.assertEqual(lemmas, lemmas_)
-
-        # POS
-        pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split()
-        pos_ = [t.pos for t in tokens]
-        self.assertEqual(pos, pos_)
-
-        # NER
-        ner = "PERSON O O O O O O O O ORGANIZATION O O".split()
-        ner_ = [t.ner for t in tokens]
-        self.assertEqual(ner, ner_)
-
-        # character offsets
-        begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()]
-        end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()]
-        begin_ = [t.beginChar for t in tokens]
-        end_ = [t.endChar for t in tokens]
-        self.assertEqual(begin, begin_)
-        self.assertEqual(end, end_)
-
-    def test_dependency_parse(self):
-        """
-        Extract the dependency parse from the annotation.
-        """
-        sentence = self.doc.sentence[0]
-
-        # You can choose from the following types of dependencies.
-        # In general, you'll want enhancedPlusPlus
-        self.assertTrue(sentence.basicDependencies.ByteSize() > 0)
-        self.assertTrue(sentence.enhancedDependencies.ByteSize() > 0)
-        self.assertTrue(sentence.enhancedPlusPlusDependencies.ByteSize() > 0)
-
-        tree = sentence.enhancedPlusPlusDependencies
-        self.assertTrue(isinstance(tree, DependencyGraph))
-        # Indices are 1-indexd with 0 being the "pseudo root"
-        self.assertEqual([2], tree.root) # 'wrote' is the root.
-        # There are as many nodes as there are tokens.
-        self.assertEqual(len(sentence.token), len(tree.node))
-
-        # Enhanced++ depdencies often contain additional edges and are
-        # not trees -- here, 'parsed' would also have an edge to
-        # 'sentence'
-        self.assertEqual(12, len(tree.edge))
-
-        # This edge goes from "wrote" to "Chirs"
-        edge = tree.edge[0]
-        self.assertEqual(2, edge.source)
-        self.assertEqual(1, edge.target)
-        self.assertEqual("nsubj", edge.dep)
-
-    def test_coref_chain(self):
-        """
-        Extract the corefence chains from the annotation.
-        """
-        # Coreference chains span sentences and are stored in the
-        # document.
-        chains = self.doc.corefChain
-
-        # In this document there is 1 chain with Chris and he.
-        self.assertEqual(1, len(chains))
-        chain = chains[0]
-        self.assertTrue(isinstance(chain, CorefChain))
-        self.assertEqual(0, chain.mention[0].beginIndex) # Starts at token 0 == 'Chris'
-        self.assertEqual(1, chain.mention[0].endIndex)
-        self.assertEqual("MALE", chain.mention[0].gender)
-
-        self.assertEqual(6, chain.mention[1].beginIndex) # Starts at token 6 == 'he'
-        self.assertEqual(7, chain.mention[1].endIndex)
-        self.assertEqual("MALE", chain.mention[1].gender)
-
-        self.assertEqual(0, chain.representative) # The head of the chain is 'Chris'
-
-
-if __name__ == "__main__":
-    unittest.main()
+
+# The text that was annotated
+TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"
+
+@fixture
+def doc_pb():
+    test_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
+    with open(test_data, 'rb') as f:
+        buf = f.read()
+    doc = Document()
+    parseFromDelimitedString(doc, buf)
+    return doc
+
+def test_parse_protobuf(doc_pb):
+    assert doc_pb.ByteSize() == 4239
+
+def test_document_text(doc_pb):
+    assert doc_pb.text == TEXT
+
+def test_sentences(doc_pb):
+    assert len(doc_pb.sentence) == 1
+
+    sentence = doc_pb.sentence[0]
+    assert isinstance(sentence, Sentence)
+    assert sentence.characterOffsetEnd - sentence.characterOffsetBegin == 67 # Sentence length
+    assert sentence.text == '' # Note that the sentence text should actually be recovered from the tokens.
+    assert to_text(sentence) == TEXT[:-1] # Note that the sentence text should actually be recovered from the tokens.
+
+def test_tokens(doc_pb):
+    sentence = doc_pb.sentence[0]
+    tokens = sentence.token
+    assert len(tokens) == 12
+    assert isinstance(tokens[0], Token)
+
+    # Word
+    words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split()
+    words_ = [t.word for t in tokens]
+    assert words_ == words
+
+    # Lemma
+    lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split()
+    lemmas_ = [t.lemma for t in tokens]
+    assert lemmas_ == lemmas
+
+    # POS
+    pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split()
+    pos_ = [t.pos for t in tokens]
+    assert pos_ == pos
+
+    # NER
+    ner = "PERSON O O O O O O O O ORGANIZATION O O".split()
+    ner_ = [t.ner for t in tokens]
+    assert ner_ == ner
+
+    # character offsets
+    begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()]
+    end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()]
+    begin_ = [t.beginChar for t in tokens]
+    end_ = [t.endChar for t in tokens]
+    assert begin_ == begin
+    assert end_ == end
+
+def test_dependency_parse(doc_pb):
+    """
+    Extract the dependency parse from the annotation.
+    """
+    sentence = doc_pb.sentence[0]
+
+    # You can choose from the following types of dependencies.
+    # In general, you'll want enhancedPlusPlus
+    assert sentence.basicDependencies.ByteSize() > 0
+    assert sentence.enhancedDependencies.ByteSize() > 0
+    assert sentence.enhancedPlusPlusDependencies.ByteSize() > 0
+
+    tree = sentence.enhancedPlusPlusDependencies
+    assert isinstance(tree, DependencyGraph)
+    # Indices are 1-indexed with 0 being the "pseudo root"
+    assert tree.root == [2] # 'wrote' is the root.
+    # There are as many nodes as there are tokens.
+    assert len(tree.node) == len(sentence.token)
+
+    # Enhanced++ dependencies often contain additional edges and are
+    # not trees -- here, 'parsed' would also have an edge to
+    # 'sentence'
+    assert len(tree.edge) == 12
+
+    # This edge goes from "wrote" to "Chris"
+    edge = tree.edge[0]
+    assert edge.source == 2
+    assert edge.target == 1
+    assert edge.dep == "nsubj"
+
+def test_coref_chain(doc_pb):
+    """
+    Extract the coreference chains from the annotation.
+    """
+    # Coreference chains span sentences and are stored in the
+    # document.
+    chains = doc_pb.corefChain
+
+    # In this document there is 1 chain with Chris and he.
+    assert len(chains) == 1
+    chain = chains[0]
+    assert isinstance(chain, CorefChain)
+    assert chain.mention[0].beginIndex == 0 # 'Chris'
+    assert chain.mention[0].endIndex == 1
+    assert chain.mention[0].gender == "MALE"
+
+    assert chain.mention[1].beginIndex == 6 # 'he'
+    assert chain.mention[1].endIndex == 7
+    assert chain.mention[1].gender == "MALE"
+
+    assert chain.representative == 0 # The head of the chain is 'Chris'
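With the class wrapper gone, the old if __name__ == "__main__" entry point is no longer needed: pytest discovers the test_* functions and the doc_pb fixture on its own, so the suite can be run directly with, for example, py.test tests/test_read.py (or pytest tests/test_read.py with newer versions of the tool).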

0 commit comments
