From a9f6b151a64d735fdfbe60f58aa4cfcb562d98c2 Mon Sep 17 00:00:00 2001
From: searchivarius
Date: Fri, 10 Feb 2023 14:35:26 -0500
Subject: [PATCH 1/3] upgrading Spacy.

---
 flexneuart/ir_datasets/spacy.py | 1 -
 flexneuart/text_proc/parse.py   | 5 ++---
 requirements.txt                | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/flexneuart/ir_datasets/spacy.py b/flexneuart/ir_datasets/spacy.py
index 60123207..2443f6d3 100644
--- a/flexneuart/ir_datasets/spacy.py
+++ b/flexneuart/ir_datasets/spacy.py
@@ -39,7 +39,6 @@ def __init__(self, input_fields : list,
         :param model_name a name of the spacy model to use, e.g., en_core_web_sm
         :param stop_word_file the name of the stop word file
         :param remove_punct a bool flag indicating if the punctuation tokens need to be removed
-        :param sent_split a bool flag indicating if sentence splitting is necessary
         :param keep_only_alpha_num a bool flag indicating if we need to keep only alpha-numeric characters
         :param enable_pos a bool flag that enables POS tagging (which, e.g., can improve lemmatization)
         """
diff --git a/flexneuart/text_proc/parse.py b/flexneuart/text_proc/parse.py
index b595fdda..b67d28c5 100644
--- a/flexneuart/text_proc/parse.py
+++ b/flexneuart/text_proc/parse.py
@@ -42,7 +42,7 @@ def __init__(self, model_name):
         :param model_name: a name of the spacy model to use, e.g., en_core_web_sm
         """
         self._nlp = spacy.load(model_name, disable=[SPACY_NER, SPACY_PARSER, SPACY_POS])
-        self._nlp.add_pipe(self._nlp.create_pipe("sentencizer"))
+        self._nlp.add_pipe("sentencizer")
 
     def __call__(self, text):
         """A thin wrapper that merely calls spacy.
@@ -111,8 +111,7 @@ def __init__(self, model_name, stop_words,
         self._nlp = spacy.load(model_name, disable=disable_list)
 
         if sent_split:
-            sentencizer = self._nlp.create_pipe("sentencizer")
-            self._nlp.add_pipe(sentencizer)
+            self._nlp.add_pipe("sentencizer")
 
         self._removePunct = remove_punct
         self._stopWords = frozenset([w.lower() for w in stop_words])
diff --git a/requirements.txt b/requirements.txt
index 0d72b51b..9270f506 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,8 +16,7 @@ torch
 torchtext
 transformers
 
-# we fix Spacy version, though in the future we would have to upgrade too
-spacy==2.2.3
+spacy
 sentence-transformers
 sentencepiece
 krovetzstemmer

From 3b847990cb3371fb0e1b8057f21341a1f5d9a77e Mon Sep 17 00:00:00 2001
From: searchivarius
Date: Sat, 18 Feb 2023 23:33:32 -0500
Subject: [PATCH 2/3] removing explicit protobuf from reqs.

---
 requirements.txt | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9270f506..06a56293 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,6 +29,9 @@ typing-extensions
 
 # Colbert specific
 ujson
-# Deeberta specific, might fail with older
-# protobuf versions.
-protobuf==3.20
+
+# DeBERTa specific, might fail with older protobuf versions.
+# However, we no longer install it by default: newer
+# transformers versions should pull in protobuf on their own.
+#protobuf==3.20
+

From 467c64278c05b432b856addf80621367ce716f4b Mon Sep 17 00:00:00 2001
From: searchivarius
Date: Tue, 21 Feb 2023 00:39:58 -0500
Subject: [PATCH 3/3] renaming TREC ROBUST04 queries sub-set

---
 .../data_convert/ir_datasets/sample_configs/trec_robust04.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json b/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json
index 779a5b0f..809b42ed 100644
--- a/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json
+++ b/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json
@@ -48,7 +48,7 @@
     ]
   },
   {
-    "part_name": "all",
+    "part_name": "queries_all",
     "dataset_name": "trec-robust04",
     "is_query": true,
     "src_attributes": [
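
A minimal sketch of the spaCy 3.x sentencizer registration that the parse.py hunks in PATCH 1/3 switch to; spacy.blank("en") is used here only so the snippet runs without a downloaded model, whereas the library code passes a model name and a disable list to spacy.load:

    import spacy

    # spaCy 3.x adds pipeline components by their registered string name;
    # the 2.x idiom nlp.add_pipe(nlp.create_pipe("sentencizer")) is rejected by 3.x.
    nlp = spacy.blank("en")        # tokenizer-only English pipeline
    nlp.add_pipe("sentencizer")    # rule-based sentence splitter
    doc = nlp("First sentence. Second sentence.")
    print([sent.text for sent in doc.sents])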