From a9f6b151a64d735fdfbe60f58aa4cfcb562d98c2 Mon Sep 17 00:00:00 2001
From: searchivarius
Date: Fri, 10 Feb 2023 14:35:26 -0500
Subject: [PATCH 1/3] upgrading Spacy.

---
 flexneuart/ir_datasets/spacy.py | 1 -
 flexneuart/text_proc/parse.py   | 5 ++---
 requirements.txt                | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/flexneuart/ir_datasets/spacy.py b/flexneuart/ir_datasets/spacy.py
index 60123207..2443f6d3 100644
--- a/flexneuart/ir_datasets/spacy.py
+++ b/flexneuart/ir_datasets/spacy.py
@@ -39,7 +39,6 @@ def __init__(self, input_fields : list,
         :param model_name a name of the spacy model to use, e.g., en_core_web_sm
         :param stop_word_file the name of the stop word file
         :param remove_punct a bool flag indicating if the punctuation tokens need to be removed
-        :param sent_split a bool flag indicating if sentence splitting is necessary
         :param keep_only_alpha_num a bool flag indicating if we need to keep only alpha-numeric characters
         :param enable_pos a bool flag that enables POS tagging (which, e.g., can improve lemmatization)
         """
diff --git a/flexneuart/text_proc/parse.py b/flexneuart/text_proc/parse.py
index b595fdda..b67d28c5 100644
--- a/flexneuart/text_proc/parse.py
+++ b/flexneuart/text_proc/parse.py
@@ -42,7 +42,7 @@ def __init__(self, model_name):
         :param model_name: a name of the spacy model to use, e.g., en_core_web_sm
         """
         self._nlp = spacy.load(model_name, disable=[SPACY_NER, SPACY_PARSER, SPACY_POS])
-        self._nlp.add_pipe(self._nlp.create_pipe("sentencizer"))
+        self._nlp.add_pipe("sentencizer")
 
     def __call__(self, text):
         """A thin wrapper that merely calls spacy.
@@ -111,8 +111,7 @@ def __init__(self, model_name, stop_words,
         self._nlp = spacy.load(model_name, disable=disable_list)
 
         if sent_split:
-            sentencizer = self._nlp.create_pipe("sentencizer")
-            self._nlp.add_pipe(sentencizer)
+            self._nlp.add_pipe("sentencizer")
 
         self._removePunct = remove_punct
         self._stopWords = frozenset([w.lower() for w in stop_words])
diff --git a/requirements.txt b/requirements.txt
index 0d72b51b..9270f506 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,8 +16,7 @@ torch
 torchtext
 transformers
 
-# we fix Spacy version, though in the future we would have to upgrade too
-spacy==2.2.3
+spacy
 sentence-transformers
 sentencepiece
 krovetzstemmer

From 3b847990cb3371fb0e1b8057f21341a1f5d9a77e Mon Sep 17 00:00:00 2001
From: searchivarius
Date: Sat, 18 Feb 2023 23:33:32 -0500
Subject: [PATCH 2/3] removing explicit protobuf from reqs.

---
 requirements.txt | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9270f506..06a56293 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,6 +29,9 @@ typing-extensions
 
 # Colbert specific
 ujson
-# Deeberta specific, might fail with older
-# protobuf versions.
-protobuf==3.20
+
+# DeBERTa specific, might fail with older protobuf versions.
+# However, we no longer install it by default: newer
+# transformers versions should pull in protobuf on their own.
+#protobuf==3.20
+

From 467c64278c05b432b856addf80621367ce716f4b Mon Sep 17 00:00:00 2001
From: searchivarius
Date: Tue, 21 Feb 2023 00:39:58 -0500
Subject: [PATCH 3/3] renaming TREC ROBUST04 queries sub-set

---
 .../data_convert/ir_datasets/sample_configs/trec_robust04.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json b/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json
index 779a5b0f..809b42ed 100644
--- a/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json
+++ b/scripts/data_convert/ir_datasets/sample_configs/trec_robust04.json
@@ -48,7 +48,7 @@
     ]
   },
   {
-    "part_name": "all",
+    "part_name": "queries_all",
     "dataset_name": "trec-robust04",
     "is_query": true,
     "src_attributes": [
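
A minimal sketch of the spaCy 3.x sentencizer registration that the parse.py hunks in PATCH 1/3 switch to; spacy.blank("en") is used here only so the snippet runs without a downloaded model, whereas the library code passes a model name and a disable list to spacy.load:

    import spacy

    # spaCy 3.x adds pipeline components by their registered string name;
    # the 2.x idiom nlp.add_pipe(nlp.create_pipe("sentencizer")) is rejected by 3.x.
    nlp = spacy.blank("en")        # tokenizer-only English pipeline
    nlp.add_pipe("sentencizer")    # rule-based sentence splitter
    doc = nlp("First sentence. Second sentence.")
    print([sent.text for sent in doc.sents])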