
Commit ba7468e

Update requirements, fixing windows crashes (#13727)
* Re-enable pretraining test
* Require thinc 8.3.4
* Reformat
* Re-enable test
1 parent 311f7cc commit ba7468e
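The functional change is a single version bump, raising the thinc floor from 8.3.0 to 8.3.4 everywhere the pin appears; the rest of the commit is formatting and re-enabling a test. As a sanity check after upgrading, here is a minimal sketch (assuming only the stdlib `importlib.metadata` and the `packaging` library, which spaCy already requires) to confirm an environment satisfies the new pin:

```python
# Minimal sketch: verify the installed thinc falls inside the new pinned range.
from importlib.metadata import version  # stdlib on Python 3.8+

from packaging.specifiers import SpecifierSet

installed = version("thinc")
assert installed in SpecifierSet(">=8.3.4,<8.4.0"), f"thinc {installed} outside pin"
```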

File tree

9 files changed: +71 -51 lines changed


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.3.0,<8.4.0",
+    "thinc>=8.3.4,<8.4.0",
     "numpy>=2.0.0,<3.0.0"
 ]
 build-backend = "setuptools.build_meta"

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.3.0,<8.4.0
+thinc>=8.3.4,<8.4.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0

setup.cfg

Lines changed: 2 additions & 2 deletions
@@ -41,15 +41,15 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.3.0,<8.4.0
+    thinc>=8.3.4,<8.4.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.3.0,<8.4.0
+    thinc>=8.3.4,<8.4.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
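Because thinc is both a build-time and a runtime dependency, the same pin is updated in lockstep in pyproject.toml, requirements.txt, and setup.cfg (under both setup_requires and install_requires). A rough consistency check, sketched with a hypothetical regex rather than any project tooling:

```python
# Sketch: assert the thinc pin agrees across the three requirement files.
# Run from the repo root; the regex is illustrative, not project tooling.
import re
from pathlib import Path

PIN = re.compile(r"thinc(>=[0-9][\w.,<>=]*)")

pins = {
    m.group(1)
    for name in ("pyproject.toml", "requirements.txt", "setup.cfg")
    for m in PIN.finditer(Path(name).read_text())
}
assert pins == {">=8.3.4,<8.4.0"}, f"inconsistent pins: {pins}"
```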

spacy/tests/lang/ca/test_text.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 """Test that longer and mixed texts are tokenized correctly."""
+
 import pytest

spacy/tests/lang/ja/test_lemmatization.py

Lines changed: 7 additions & 1 deletion
@@ -3,7 +3,13 @@
 @pytest.mark.parametrize(
     "word,lemma",
-    [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")],
+    [
+        ("新しく", "新しい"),
+        ("赤く", "赤い"),
+        ("すごく", "すごい"),
+        ("いただきました", "いただく"),
+        ("なった", "なる"),
+    ],
 )
 def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
     test_lemma = ja_tokenizer(word)[0].lemma_

spacy/tests/lang/ja/test_tokenizer.py

Lines changed: 6 additions & 1 deletion
@@ -143,7 +143,12 @@ def test_ja_tokenizer_sub_tokens(
     [
         (
             "取ってつけた",
-            (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
+            (
+                ["五段-ラ行;連用形-促音便"],
+                [],
+                ["下一段-カ行;連用形-一般"],
+                ["助動詞-タ;終止形-一般"],
+            ),
             (["トッ"], ["テ"], ["ツケ"], ["タ"]),
         ),
         ("2=3", ([], [], []), (["ニ"], ["_"], ["サン"])),

spacy/tests/lang/ko/test_lemmatization.py

Lines changed: 8 additions & 1 deletion
@@ -2,7 +2,14 @@
 @pytest.mark.parametrize(
-    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+    "word,lemma",
+    [
+        ("새로운", "새롭"),
+        ("빨간", "빨갛"),
+        ("클수록", "크"),
+        ("뭡니까", "뭣"),
+        ("됐다", "되"),
+    ],
 )
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_

spacy/tests/lang/pl/test_text.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 """Words like numbers are recognized correctly."""
+
 import pytest

spacy/tests/training/test_pretraining.py.disabled renamed to spacy/tests/training/test_pretraining.py

Lines changed: 44 additions & 44 deletions
@@ -265,50 +265,50 @@ def test_pretraining_tagger():
 # Try to debug segfault on windows
-#def test_pretraining_training():
-#    """Test that training can use a pretrained Tok2Vec model"""
-#    config = Config().from_str(pretrain_string_internal)
-#    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
-#    filled = nlp.config
-#    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
-#    filled = pretrain_config.merge(filled)
-#    train_config = util.load_config(DEFAULT_CONFIG_PATH)
-#    filled = train_config.merge(filled)
-#    with make_tempdir() as tmp_dir:
-#        pretrain_dir = tmp_dir / "pretrain"
-#        pretrain_dir.mkdir()
-#        file_path = write_sample_jsonl(pretrain_dir)
-#        filled["paths"]["raw_text"] = file_path
-#        filled["pretraining"]["component"] = "tagger"
-#        filled["pretraining"]["layer"] = "tok2vec"
-#        train_dir = tmp_dir / "train"
-#        train_dir.mkdir()
-#        train_path, dev_path = write_sample_training(train_dir)
-#        filled["paths"]["train"] = train_path
-#        filled["paths"]["dev"] = dev_path
-#        filled = filled.interpolate()
-#        P = filled["pretraining"]
-#        nlp_base = init_nlp(filled)
-#        model_base = (
-#            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-#        )
-#        embed_base = None
-#        for node in model_base.walk():
-#            if node.name == "hashembed":
-#                embed_base = node
-#        pretrain(filled, pretrain_dir)
-#        pretrained_model = Path(pretrain_dir / "model3.bin")
-#        assert pretrained_model.exists()
-#        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
-#        nlp = init_nlp(filled)
-#        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-#        embed = None
-#        for node in model.walk():
-#            if node.name == "hashembed":
-#                embed = node
-#        # ensure that the tok2vec weights are actually changed by the pretraining
-#        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
-#        train(nlp, train_dir)
+def test_pretraining_training():
+    """Test that training can use a pretrained Tok2Vec model"""
+    config = Config().from_str(pretrain_string_internal)
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    train_config = util.load_config(DEFAULT_CONFIG_PATH)
+    filled = train_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        pretrain_dir = tmp_dir / "pretrain"
+        pretrain_dir.mkdir()
+        file_path = write_sample_jsonl(pretrain_dir)
+        filled["paths"]["raw_text"] = file_path
+        filled["pretraining"]["component"] = "tagger"
+        filled["pretraining"]["layer"] = "tok2vec"
+        train_dir = tmp_dir / "train"
+        train_dir.mkdir()
+        train_path, dev_path = write_sample_training(train_dir)
+        filled["paths"]["train"] = train_path
+        filled["paths"]["dev"] = dev_path
+        filled = filled.interpolate()
+        P = filled["pretraining"]
+        nlp_base = init_nlp(filled)
+        model_base = (
+            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+        )
+        embed_base = None
+        for node in model_base.walk():
+            if node.name == "hashembed":
+                embed_base = node
+        pretrain(filled, pretrain_dir)
+        pretrained_model = Path(pretrain_dir / "model3.bin")
+        assert pretrained_model.exists()
+        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
+        nlp = init_nlp(filled)
+        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+        embed = None
+        for node in model.walk():
+            if node.name == "hashembed":
+                embed = node
+        # ensure that the tok2vec weights are actually changed by the pretraining
+        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
+        train(nlp, train_dir)
 
 
 def write_sample_jsonl(tmp_dir):
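With the crash fixed upstream, the test file is renamed back from .disabled and the body is restored verbatim, minus the leading # on each line. To run just this test, a sketch (programmatic pytest invocation, equivalent to the CLI form):

```python
# Sketch: run only the re-enabled test from the repository root.
import pytest

pytest.main(["spacy/tests/training/test_pretraining.py::test_pretraining_training", "-v"])
```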
