Commit 2adbeab

Update pytorch-partial-tagger (#34)
* Bump pytorch-partial-tagger version
* Update incompatible parts
* Bump version
* Fix the import order in tokenizer.py
* Update requirements.txt
1 parent 8b151b7 commit 2adbeab

5 files changed: +15, -17 lines

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@ requires-python = ">=3.8"
 
 [tool.poetry]
 name = "spacy-partial-tagger"
-version = "0.13.0"
+version = "0.14.0"
 description = "Sequence Tagger for Partially Annotated Dataset in spaCy"
 authors = ["yasufumi <yasufumi.taniguchi@gmail.com>"]
 license = "MIT"
@@ -27,7 +27,7 @@ transformers = {extras = ["ja"], version = "^4.25.1"}
 torch = "^2.0.1"
 spacy = {extras = ["transformers"], version = "^3.3.1"}
 spacy-alignments = "^0.8.5"
-pytorch-partial-tagger = "^0.1.5"
+pytorch-partial-tagger = "^0.1.6"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.3.0"
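The two version bumps move in lockstep: spacy-partial-tagger 0.14.0 exists to track the pytorch-partial-tagger 0.1.6 API. With Poetry, a bump like this is typically done with `poetry version 0.14.0` followed by `poetry update pytorch-partial-tagger`; the exact commands are an assumption here, since the commit records only the resulting TOML.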

requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -40,9 +40,9 @@ pydantic==1.10.8 ; python_version >= "3.8" and python_version < "4.0"
 pyflakes==2.4.0 ; python_version >= "3.8" and python_version < "4.0"
 pytest-cov==3.0.0 ; python_version >= "3.8" and python_version < "4.0"
 pytest==7.3.1 ; python_version >= "3.8" and python_version < "4.0"
-pytorch-partial-tagger==0.1.5 ; python_version >= "3.8" and python_version < "4.0"
+pytorch-partial-tagger==0.1.6 ; python_version >= "3.8" and python_version < "4.0"
 pyyaml==6.0 ; python_version >= "3.8" and python_version < "4.0"
-regex==2023.5.5 ; python_version >= "3.8" and python_version < "4.0"
+regex==2023.6.3 ; python_version >= "3.8" and python_version < "4.0"
 requests==2.31.0 ; python_version >= "3.8" and python_version < "4.0"
 rhoknp==1.3.1 ; python_version >= "3.8" and python_version < "4.0"
 ruff==0.0.270 ; python_version >= "3.8" and python_version < "4.0"
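requirements.txt pins the same 0.1.6 release. The incidental regex bump (2023.5.5 to 2023.6.3) suggests the file was regenerated from the Poetry lock file rather than edited by hand, e.g. with `poetry export --without-hashes -f requirements.txt -o requirements.txt` (an assumption; the export command itself is not recorded in the commit).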

spacy_partial_tagger/pipeline.py

Lines changed: 5 additions & 7 deletions
@@ -52,9 +52,9 @@ def set_annotations(
         tokenized_texts = [doc.user_data["tokenized_text"] for doc in docs]
         tag_factory = TagFactory(tokenized_texts, self.label_set)
 
-        tags_collection = tag_factory.create_char_based_tags(tag_indices)
+        tags_batch = tag_factory.create_char_based_tags(tag_indices)
 
-        for doc, tags in zip(docs, tags_collection):
+        for doc, tags in zip(docs, tags_batch):
             ents = []
             for tag in tags:
                 span = doc.char_span(tag.start, tag.start + tag.length, tag.label)
@@ -115,13 +115,13 @@ def get_loss(
         ]
         tag_factory = TagFactory(tokenized_texts, self.label_set)
 
-        tags_collection = []
+        tags_batch = []
         for example in examples:
             tags = tuple(
                 create_tag(ent.start_char, len(ent.text), ent.label_)
                 for ent in example.y.ents
             )
-            tags_collection.append(CharBasedTags(tags, example.y.text))
+            tags_batch.append(CharBasedTags(tags, example.y.text))
 
         lengths = [text.num_tokens for text in tokenized_texts]
         max_length = max(lengths)
@@ -130,9 +130,7 @@
             device=scores_pt.device,
         )
 
-        tag_bitmap = tag_factory.create_tag_bitmap(
-            tuple(tags_collection), scores_pt.device
-        )
+        tag_bitmap = tag_factory.create_tag_bitmap(tuple(tags_batch), scores_pt.device)
 
         loss = expected_entity_ratio_loss(
             scores_pt, tag_bitmap, mask, self.label_set.get_outside_index()
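The rename from `tags_collection` to `tags_batch` follows the upstream 0.1.6 vocabulary (batches rather than collections); the logic is untouched. As a minimal sketch of what `set_annotations` does with those tags, the snippet below converts character-offset tags into spaCy entity spans. `CharTag` is a hypothetical stand-in for the tag objects yielded by `TagFactory.create_char_based_tags`; the real ones come from pytorch-partial-tagger and expose the same `start`/`length`/`label` fields used in the diff above.

```python
from dataclasses import dataclass

import spacy


# Hypothetical stand-in for the char-based tag objects produced by
# TagFactory.create_char_based_tags in the diff above.
@dataclass
class CharTag:
    start: int
    length: int
    label: str


nlp = spacy.blank("en")
doc = nlp("Tokyo is the capital of Japan.")
tags = [CharTag(0, 5, "LOC"), CharTag(24, 5, "LOC")]

# Same conversion as set_annotations: character offsets -> spaCy Spans.
ents = []
for tag in tags:
    span = doc.char_span(tag.start, tag.start + tag.length, tag.label)
    if span is not None:  # offsets that cross token boundaries return None
        ents.append(span)
doc.ents = ents

print([(ent.text, ent.label_) for ent in doc.ents])
# [('Tokyo', 'LOC'), ('Japan', 'LOC')]
```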

spacy_partial_tagger/tagger.py

Lines changed: 3 additions & 3 deletions
@@ -45,15 +45,15 @@ def forward(
 
     tokenizer: BaseTokenizer = model.attrs["tokenizer"]
 
-    tokenized_texts = tokenizer(tuple(doc.text for doc in X))
+    text_batch = tokenizer(tuple(doc.text for doc in X))
 
-    for doc, text in zip(X, tokenized_texts.tokenized_texts):
+    for doc, text in zip(X, text_batch.tokenized_texts):
         doc.user_data["tokenized_text"] = text
 
     device = get_torch_default_device()
 
     (log_potentials, tag_indices), backward = model.layers[0](
-        [tokenized_texts.get_tagger_inputs(device), tokenized_texts.get_mask(device)],
+        [text_batch.get_tagger_inputs(device), text_batch.get_mask(device)],
         is_train,
     )
 
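Here too the change is mechanical: the tokenizer now returns a `TextBatch`, so the local variable becomes `text_batch`, which also removes the awkward `tokenized_texts.tokenized_texts` access. Below is a hedged sketch of the same preprocessing as a standalone helper, using only the attributes that appear in this diff (`tokenized_texts`, `get_tagger_inputs`, `get_mask`); the signature and name are illustrative, not part of the package.

```python
import torch


def attach_and_prepare(tokenizer, docs, device: torch.device):
    """Sketch of forward()'s preprocessing; `tokenizer` is assumed to be a
    BaseTokenizer from partial_tagger (0.1.6) returning a TextBatch."""
    text_batch = tokenizer(tuple(doc.text for doc in docs))

    # Stash each doc's tokenization for set_annotations to pick up later.
    for doc, text in zip(docs, text_batch.tokenized_texts):
        doc.user_data["tokenized_text"] = text

    # Model inputs, moved to the target device.
    return text_batch.get_tagger_inputs(device), text_batch.get_mask(device)
```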

spacy_partial_tagger/tokenizer.py

Lines changed: 3 additions & 3 deletions
@@ -4,8 +4,8 @@
 from partial_tagger.data import Span, TokenizedText
 from partial_tagger.data.batch.text import (
     BaseTokenizer,
+    TextBatch,
     Texts,
-    TokenizedTexts,
     TransformerTokenizer,
 )
 from transformers import AutoTokenizer
@@ -31,7 +31,7 @@ def __init__(
         }
         self.__tokenizer_args["return_offsets_mapping"] = True
 
-    def __call__(self, texts: Texts) -> TokenizedTexts:
+    def __call__(self, texts: Texts) -> TextBatch:
         batch_encoding = self.__tokenizer(texts, **self.__tokenizer_args)
 
         pad_token_id = self.__tokenizer.pad_token_id
@@ -62,7 +62,7 @@ def __call__(self, texts: Texts) -> TokenizedTexts:
         mask = torch.tensor(
             [[True] * length + [False] * (max_length - length) for length in lengths]
         )
-        return TokenizedTexts(tuple(tokenized_texts), batch_encoding, mask)
+        return TextBatch(tuple(tokenized_texts), batch_encoding, mask)
 
 
 def get_tokenizer(
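Apart from the `TokenizedTexts` to `TextBatch` rename (and the alphabetical import-order fix mentioned in the commit message), the tokenizer logic is unchanged. The padding mask it builds is plain PyTorch and easy to check in isolation:

```python
import torch

# Mask construction exactly as in __call__ above: True over real tokens,
# False over padding, for an example batch with token counts [3, 5, 2].
lengths = [3, 5, 2]
max_length = max(lengths)
mask = torch.tensor(
    [[True] * length + [False] * (max_length - length) for length in lengths]
)
print(mask)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True],
#         [ True,  True, False, False, False]])
```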
