This repository has been archived by the owner on Oct 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathinstructions.py
1566 lines (1223 loc) · 51.6 KB
/
instructions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# coding=utf-8
# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Library of instructions."""
import collections
import json
import random
import re
import string
from typing import Dict, Optional, Sequence, Union
from absl import logging
import langdetect
import instructions_util
# Type alias describing the keyword arguments an instruction may carry:
# either nothing, or a mapping from argument name to an int, a string, or a
# sequence of strings.
_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]
# Language table from `instructions_util`; the code below reads its keys and
# indexes it by key when formatting a description.
_LANGUAGES = instructions_util.LANGUAGE_CODES
# The relational operators supported for comparisons. Index 0 is the strict
# "less than" check and index 1 is the inclusive "at least" check.
_COMPARISON_RELATION = ("less than", "at least")
# Upper bound used when the sentence threshold is drawn at random.
_MAX_NUM_SENTENCES = 20
# Upper bound used when the number of placeholders is drawn at random.
_NUM_PLACEHOLDERS = 4
# Upper bound used when the number of bullet points is drawn at random.
_NUM_BULLETS = 5
# The accepted answers for the constrained-response instruction.
_CONSTRAINED_RESPONSE_OPTIONS = (
"My answer is yes.", "My answer is no.", "My answer is maybe.")
# Starter phrases to pick from when no starter is supplied.
_STARTER_OPTIONS = ("I would say", "My answer is", "I believe",
"In my opinion", "I think", "I reckon", "I feel",
"From my perspective", "As I see it", "According to me",
"As far as I'm concerned", "To my understanding",
"In my view", "My take on it is", "As per my perception")
# The options of ending phrases.
# TODO(jeffreyzhou) add more ending options
_ENDING_OPTIONS = ("Any other questions?",
"Is there anything else I can help with?")
# Upper bound used when the number of highlighted sections is drawn at random.
_NUM_HIGHLIGHTED_SECTIONS = 4
# Section splitter keywords to pick from when none is supplied.
_SECTION_SPLITER = ("Section", "SECTION")
# Upper bound used when the number of sections is drawn at random.
_NUM_SECTIONS = 5
# Upper bound used when the number of paragraphs is drawn at random.
_NUM_PARAGRAPHS = 5
# Postscript markers to pick from when none is supplied.
_POSTSCRIPT_MARKER = ("P.S.", "P.P.S")
# How many keywords are generated when none are supplied.
_NUM_KEYWORDS = 2
# Upper bound used when the frequency of a single keyword is drawn at random.
_KEYWORD_FREQUENCY = 3
# The occurrences of a single letter.
_LETTER_FREQUENCY = 10
# The occurrences of words with all capital letters.
_ALL_CAPITAL_WORD_FREQUENCY = 20
# Inclusive range from which the required word count is drawn at random.
_NUM_WORDS_LOWER_LIMIT = 100
_NUM_WORDS_UPPER_LIMIT = 500
class Instruction:
    """Base template for an instruction.

    The four hook methods below only raise `NotImplementedError`; concrete
    instructions are expected to override every one of them.
    """

    def __init__(self, instruction_id):
        """Stores the identifier of this instruction.

        Args:
          instruction_id: The identifier, kept verbatim on `self.id`.
        """
        self.id = instruction_id

    def build_description(self, **kwargs):
        """Builds the instruction text; must be overridden."""
        raise NotImplementedError("`build_description` not implemented.")

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`; must be overridden."""
        raise NotImplementedError("`get_instruction_args` not implemented.")

    def get_instruction_args_keys(self):
        """Returns the arg names of `build_description`; must be overridden."""
        raise NotImplementedError(
            "`get_instruction_args_keys` not implemented.")

    def check_following(self, value):
        """Checks whether `value` follows the instruction; must be overridden."""
        raise NotImplementedError("`check_following` not implemented.")
class ResponseLanguageChecker(Instruction):
    """Checks the language of the whole response."""

    def build_description(self, *, language=None):
        """Builds the instruction description.

        Args:
          language: A key of `_LANGUAGES` naming the expected language of the
            response. When omitted, a key is picked at random. The same value
            is later compared against the output of `langdetect.detect`.

        Returns:
          A string representing the instruction description.
        """
        if language is None:
            language = random.choice(list(_LANGUAGES.keys()))
        self._language = language
        # TODO(tianjianlu): opens the description generation to more choices.
        self._description_pattern = (
            "Your ENTIRE response should be in {language} language, no other "
            "language is allowed.")
        return self._description_pattern.format(language=_LANGUAGES[language])

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"language": self._language}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["language"]

    def check_following(self, value):
        """Checks whether the detected language matches the instruction.

        Args:
          value: A string representing the response.

        Returns:
          True if `langdetect` reports the expected language for `value`, or
          if detection fails (a failure is counted as following the
          instruction); otherwise False.
        """
        assert isinstance(value, str)
        try:
            detected = langdetect.detect(value)
        except langdetect.LangDetectException as err:
            # Count as instruction is followed.
            logging.error(
                "Unable to detect language for text %s due to %s", value, err
            )  # refex: disable=pytotw.037
            return True
        return detected == self._language
class NumberOfSentences(Instruction):
    """Checks the number of sentences in the response."""

    def build_description(self, *, num_sentences=None, relation=None):
        """Builds the instruction description.

        Args:
          num_sentences: An integer threshold for the number of sentences. A
            missing or negative value is replaced by a random integer in
            [1, `_MAX_NUM_SENTENCES`].
          relation: One of the strings in `_COMPARISON_RELATION`:
            'less than' requires the sentence count to be < the threshold;
            'at least' requires the sentence count to be >= the threshold.
            A missing value is replaced by a random choice.

        Returns:
          A string representing the instruction description.

        Raises:
          ValueError: If `relation` is given but is not supported.
        """
        threshold = num_sentences
        if threshold is None or threshold < 0:
            threshold = random.randint(1, _MAX_NUM_SENTENCES)
        # The number of sentences as a threshold for comparison.
        self._num_sentences_threshold = threshold

        if relation is not None and relation not in _COMPARISON_RELATION:
            raise ValueError(
                "The supported relation for comparison must be in "
                f"{_COMPARISON_RELATION}, but {relation} is given.")
        self._comparison_relation = (
            random.choice(_COMPARISON_RELATION) if relation is None
            else relation)

        self._description_pattern = (
            "Your response should contain {relation} {num_sentences} sentences.")
        return self._description_pattern.format(
            relation=self._comparison_relation,
            num_sentences=self._num_sentences_threshold)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {
            "num_sentences": self._num_sentences_threshold,
            "relation": self._comparison_relation,
        }

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["num_sentences", "relation"]

    def check_following(self, value):
        """Checks whether the sentence count satisfies the stored relation.

        Args:
          value: A string representing the response.

        Returns:
          The result of comparing the count reported by
          `instructions_util.count_sentences` with the threshold; None if the
          stored relation is neither of the supported ones.
        """
        sentence_count = instructions_util.count_sentences(value)
        if self._comparison_relation == _COMPARISON_RELATION[0]:
            return sentence_count < self._num_sentences_threshold
        if self._comparison_relation == _COMPARISON_RELATION[1]:
            return sentence_count >= self._num_sentences_threshold
        return None
class PlaceholderChecker(Instruction):
    """Checks the placeholders in template writing."""

    def build_description(self, *, num_placeholders=None):
        """Builds the instruction description.

        Args:
          num_placeholders: An integer giving the minimum number of
            placeholders required in the response. A missing or negative value
            is replaced by a random integer in [1, `_NUM_PLACEHOLDERS`].

        Returns:
          A string representing the instruction description.
        """
        required = num_placeholders
        if required is None or required < 0:
            required = random.randint(1, _NUM_PLACEHOLDERS)
        self._num_placeholders = required
        self._description_pattern = (
            "The response must contain at least {num_placeholders} placeholders "
            "represented by square brackets, such as [address].")
        return self._description_pattern.format(num_placeholders=required)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"num_placeholders": self._num_placeholders}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["num_placeholders"]

    def check_following(self, value):
        """Checks whether the response contains enough placeholders.

        Args:
          value: A string representing the response.

        Returns:
          True if the number of bracketed spans such as `[address]` found in
          the response is greater than or equal to `num_placeholders`;
          otherwise False.
        """
        # Non-greedy match so that "[a] and [b]" counts as two placeholders.
        found = len(re.findall(r"\[.*?\]", value))
        return found >= self._num_placeholders
class BulletListChecker(Instruction):
    """Checks the number of bullet points in the response."""

    def build_description(self, *, num_bullets=None):
        """Builds the instruction description.

        Args:
          num_bullets: An integer giving the exact number of bullet points
            required in the response. A missing or negative value is replaced
            by a random integer in [1, `_NUM_BULLETS`].

        Returns:
          A string representing the instruction description.
        """
        required = num_bullets
        if required is None or required < 0:
            required = random.randint(1, _NUM_BULLETS)
        self._num_bullets = required
        self._description_pattern = (
            "Your answer must contain exactly {num_bullets} bullet points. "
            "Use the markdown bullet points such as:\n"
            "* This is point 1. \n"
            "* This is point 2")
        return self._description_pattern.format(num_bullets=required)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"num_bullets": self._num_bullets}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["num_bullets"]

    def check_following(self, value):
        """Checks whether the number of bullet points meets the requirement.

        Args:
          value: A string representing the response. Lines that begin
            (after optional whitespace) with a single asterisk or with a
            hyphen are counted as bullet points.

        Returns:
          True if the number of such lines equals the required number.
        """
        bullet_patterns = (
            r"^\s*\*[^\*].*$",  # "* item", but not "** bold" openers.
            r"^\s*-.*$",  # "- item".
        )
        total = sum(
            len(re.findall(pattern, value, flags=re.MULTILINE))
            for pattern in bullet_patterns)
        return total == self._num_bullets
class ConstrainedResponseChecker(Instruction):
    """Checks that the response uses one of a fixed set of answers."""

    def build_description(self):
        """Builds the instruction description.

        Returns:
          A string listing the options in `_CONSTRAINED_RESPONSE_OPTIONS`.
        """
        # A sequence of string(s) representing the options of the expected
        # response.
        options = _CONSTRAINED_RESPONSE_OPTIONS
        self._constrained_responses = options
        self._description_pattern = (
            "Answer with one of the following options: {response_options}")
        return self._description_pattern.format(response_options=options)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks whether the response contains one of the allowed options.

        Args:
          value: A string representing the response.

        Returns:
          True if any of the constrained responses occurs as a substring of
          the response; otherwise False.
        """
        stripped = value.strip()
        return any(
            option in stripped for option in self._constrained_responses)
class ConstrainedStartChecker(Instruction):
    """Checks how the response starts."""

    def build_description(self, *, starter=None):
        """Builds the instruction description.

        Args:
          starter: A string representing the keyword or phrase that the
            response should start with. Surrounding whitespace is stripped.
            When omitted, a phrase is picked at random from
            `_STARTER_OPTIONS`.

        Returns:
          A string representing the instruction description.
        """
        self._starter = starter.strip() if isinstance(starter, str) else starter
        if self._starter is None:
            self._starter = random.choice(_STARTER_OPTIONS)
        self._description_pattern = (
            "During the conversation, when it is your turn, "
            "please always start with {starter}")
        return self._description_pattern.format(starter=self._starter)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"starter": self._starter}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["starter"]

    def check_following(self, value):
        """Checks whether the response starts with the required phrase.

        Args:
          value: A string representing the response.

        Returns:
          True if some line of the response begins (after optional
          whitespace) with the starter phrase; otherwise False. The pattern is
          matched in multi-line mode, so the phrase is accepted at the start
          of any line, not only the first one.
        """
        # The starter is literal text, not a regex: escape it so that
        # characters such as ".", "?" or "(" are matched verbatim instead of
        # being interpreted as (or breaking) a pattern.
        response_pattern = r"^\s*" + re.escape(self._starter) + r".*$"
        return bool(re.search(response_pattern, value, flags=re.MULTILINE))
class HighlightSectionChecker(Instruction):
    """Checks the highlighted sections."""

    def build_description(self, *, num_highlights=None):
        """Builds the instruction description.

        Args:
          num_highlights: An integer giving the minimum number of highlighted
            sections. A missing or negative value is replaced by a random
            integer in [1, `_NUM_HIGHLIGHTED_SECTIONS`].

        Returns:
          A string representing the instruction description.
        """
        required = num_highlights
        if required is None or required < 0:
            required = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
        self._num_highlights = required
        self._description_pattern = (
            "Highlight at least {num_highlights} sections in your answer with "
            "markdown, i.e. *highlighted section*.")
        return self._description_pattern.format(num_highlights=required)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"num_highlights": self._num_highlights}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["num_highlights"]

    def check_following(self, value):
        """Checks whether enough highlighted sections are present.

        Args:
          value: A string representing the response. Highlights are spans on
            a single line wrapped as *highlighted* or **highlighted**.

        Returns:
          True if the number of non-empty highlighted spans is at least the
          required minimum; otherwise False.
        """
        single_starred = re.findall(r"\*[^\n\*]*\*", value)
        double_starred = re.findall(r"\*\*[^\n\*]*\*\*", value)

        # Spans whose content is empty or whitespace-only are not counted.
        found = sum(
            1 for span in single_starred if span.strip("*").strip())
        found += sum(
            1 for span in double_starred
            if span.removeprefix("**").removesuffix("**").strip())
        return found >= self._num_highlights
class SectionChecker(Instruction):
    """Checks the sections of the response."""

    def build_description(self, *, section_spliter=None, num_sections=None):
        """Builds the instruction description.

        Args:
          section_spliter: A string giving the keyword that marks a new
            section, e.g. `Section` or `SECTION`. Surrounding whitespace is
            stripped. When omitted, one of `_SECTION_SPLITER` is picked at
            random.
          num_sections: An integer specifying the number of sections. A
            missing or negative value is replaced by a random integer in
            [1, `_NUM_SECTIONS`].

        Returns:
          A string representing the instruction description.
        """
        self._section_spliter = section_spliter.strip() if isinstance(
            section_spliter, str) else section_spliter
        if self._section_spliter is None:
            self._section_spliter = random.choice(_SECTION_SPLITER)

        self._num_sections = num_sections
        if self._num_sections is None or self._num_sections < 0:
            self._num_sections = random.randint(1, _NUM_SECTIONS)

        self._description_pattern = (
            "Your response must have {num_sections} sections. Mark the beginning "
            "of each section with {section_spliter} X, such as:\n"
            "{section_spliter} 1\n"
            "[content of section 1]\n"
            "{section_spliter} 2\n"
            "[content of section 2]")
        return self._description_pattern.format(
            num_sections=self._num_sections,
            section_spliter=self._section_spliter)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"section_spliter": self._section_spliter,
                "num_sections": self._num_sections}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["section_spliter", "num_sections"]

    def check_following(self, value):
        """Checks that the response contains the required number of sections.

        Args:
          value: A string representing the response. A new section starts
            with the splitter keyword followed by a number, e.g. `Section 1`.

        Returns:
          True if the number of section markers found in the response is
          greater than or equal to the required number; otherwise False.
        """
        # The splitter is literal text: escape it so that regex
        # metacharacters in a caller-supplied keyword are matched verbatim.
        section_splitter_pattern = (
            r"\s?" + re.escape(self._section_spliter) + r"\s?\d+\s?")
        sections = re.split(section_splitter_pattern, value)
        # Splitting on k markers yields k + 1 pieces.
        num_sections = len(sections) - 1
        return num_sections >= self._num_sections
class ParagraphChecker(Instruction):
    """Checks the number of paragraphs."""

    def build_description(self, *, num_paragraphs=None):
        """Builds the instruction description.

        Args:
          num_paragraphs: An integer specifying the number of paragraphs. A
            missing or negative value is replaced by a random integer in
            [1, `_NUM_PARAGRAPHS`].

        Returns:
          A string representing the instruction description.
        """
        required = num_paragraphs
        if required is None or required < 0:
            required = random.randint(1, _NUM_PARAGRAPHS)
        self._num_paragraphs = required
        self._description_pattern = (
            "There should be {num_paragraphs} paragraphs. "
            "Paragraphs are separated with the markdown divider: ***")
        return self._description_pattern.format(num_paragraphs=required)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"num_paragraphs": self._num_paragraphs}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["num_paragraphs"]

    def check_following(self, value):
        """Checks that the response contains the required number of paragraphs.

        Args:
          value: A string representing the response. Paragraphs are expected
            to be separated by the markdown divider `***`.

        Returns:
          True if the number of paragraphs equals the required number. An
          empty piece before the first divider or after the last one is
          ignored; an empty piece between two dividers makes the check fail.
        """
        pieces = re.split(r"\s?\*\*\*\s?", value)
        last_position = len(pieces) - 1
        remaining = len(pieces)
        for position, piece in enumerate(pieces):
            if piece.strip():
                continue
            if position not in (0, last_position):
                # Two dividers with nothing between them.
                return False
            remaining -= 1
        return remaining == self._num_paragraphs
class PostscriptChecker(Instruction):
    """Checks the postscript."""

    def build_description(self, *, postscript_marker=None):
        """Builds the instruction description.

        Args:
          postscript_marker: A string containing the keyword that marks the
            start of the postscript section. Surrounding whitespace is
            stripped. When omitted, one of `_POSTSCRIPT_MARKER` is picked at
            random.

        Returns:
          A string representing the instruction description.
        """
        self._postscript_marker = postscript_marker.strip() if isinstance(
            postscript_marker, str) else postscript_marker
        if self._postscript_marker is None:
            self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
        self._description_pattern = (
            "At the end of your response, please explicitly add a postscript "
            "starting with {postscript}")
        return self._description_pattern.format(
            postscript=self._postscript_marker)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"postscript_marker": self._postscript_marker}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["postscript_marker"]

    def check_following(self, value):
        """Checks whether the response contains a postscript.

        Args:
          value: A string representing the response. The comparison is
            case-insensitive (the response is lower-cased first).

        Returns:
          True if the response contains the postscript marker followed by
          the rest of its line; otherwise False.
        """
        value = value.lower()
        if self._postscript_marker == "P.P.S":
            # Tolerate an optional space after each dot, e.g. "P. P. S".
            postscript_pattern = r"\s*p\.\s?p\.\s?s.*$"
        elif self._postscript_marker == "P.S.":
            postscript_pattern = r"\s*p\.\s?s\..*$"
        else:
            # A custom marker is literal text: escape it so that regex
            # metacharacters (dots, parentheses, ...) are matched verbatim.
            postscript_pattern = (
                r"\s*" + re.escape(self._postscript_marker.lower()) + r".*$")
        postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE)
        return bool(postscript)
class RephraseChecker(Instruction):
    """Checks the rephrase."""

    def build_description(self, *, original_message):
        """Builds the instruction description.

        Args:
          original_message: A string representing the original message. The
            rephrased response should only change its words/sentences in
            between its two asterisks, for example, *change me*. Both original
            and rephrased messages should contain the changes in the form of
            *change me*.

        Returns:
          A string representing the instruction description.

        Raises:
          ValueError: If `original_message` has no span of the form
            *change me*.
        """
        if not self.is_change(original_message):
            raise ValueError(
                f"Message {original_message} does not contain changes "
                "in the form of *change me*.")

        self._reference_without_change = original_message
        # Each fragment ends with a space so that the concatenated sentence
        # does not run words together ("onlychange", "asteriskssuch").
        self._description = (
            "Rephrasing: Your rephrased response should only "
            "change the words/sentences in between two asterisks "
            "such as *change me*.")
        return self._description

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"original_message": self._reference_without_change}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["original_message"]

    def check_following(self, value):
        """Checks whether the rephrasing follows the instruction.

        Args:
          value: A string representing the response, which is expected to
            rephrase the original message.

        Returns:
          True if `value` and the original message only differ by the
          words/sentences in between two asterisks such as *change me*;
          otherwise, False.

        Raises:
          ValueError: If `value` has no span of the form *change me*.
        """
        if not self.is_change(value):
            raise ValueError(f"value {value} does not contain "
                             "changes in the form of *change me*.")

        response_without_changes = self.strip_changes(value)
        reference_without_changes = self.strip_changes(
            self._reference_without_change)
        return response_without_changes == reference_without_changes

    def is_change(self, response):
        """Checks if the response has a change in the form of *change me*.

        Returns:
          The first regex match (truthy) if a span is found, else None.
        """
        return re.search(r"\*.*?\*", response)

    def strip_changes(self, response):
        """Strips off every *change me* span, keeping the text around them."""
        # Non-greedy on purpose: a greedy `.*` would swallow everything from
        # the first asterisk to the last one, including unchanged text that
        # sits between two separate change spans.
        return re.sub(r"\*.*?\*", "", response)
class KeywordChecker(Instruction):
    """Checks the existence of certain keywords."""

    def build_description(self, *, keywords=None):
        """Builds the instruction description.

        Args:
          keywords: A sequence of strings representing the keywords that are
            expected in the response. When empty or omitted,
            `_NUM_KEYWORDS` keywords are obtained from
            `instructions_util.generate_keywords`.

        Returns:
          A string representing the instruction description.
        """
        if not keywords:
            self._keywords = instructions_util.generate_keywords(
                num_keywords=_NUM_KEYWORDS)
        else:
            self._keywords = keywords
        # Sorted so that the description and stored args have a stable order.
        self._keywords = sorted(self._keywords)

        self._description_pattern = (
            "Include keywords {keywords} in the response.")
        return self._description_pattern.format(keywords=self._keywords)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"keywords": self._keywords}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["keywords"]

    def check_following(self, value):
        """Checks whether the response contains every expected keyword.

        Args:
          value: A string representing the response.

        Returns:
          True if each keyword occurs in the response, ignoring case;
          otherwise False.
        """
        # Keywords are literal text: escape them so that entries such as
        # "c++" or "a.b" are matched verbatim instead of as regex syntax.
        return all(
            re.search(re.escape(keyword), value, flags=re.IGNORECASE)
            for keyword in self._keywords)
class KeywordFrequencyChecker(Instruction):
    """Checks the keyword frequency."""

    def build_description(self, *, keyword=None, frequency=None,
                          relation=None):
        """Builds the instruction description.

        Args:
          keyword: A string representing a keyword that is expected in the
            response. Surrounding whitespace is stripped. When empty or
            omitted, one keyword is obtained from
            `instructions_util.generate_keywords`.
          frequency: An integer specifying the number of times `keyword` is
            expected to appear in the response. A missing or negative value is
            replaced by a random integer in [1, `_KEYWORD_FREQUENCY`].
          relation: One of the strings in `_COMPARISON_RELATION`:
            'less than' requires the number of occurrences to be < frequency;
            'at least' requires the number of occurrences to be >= frequency.
            A missing value is replaced by a random choice.

        Returns:
          A string representing the instruction description.

        Raises:
          ValueError: If `relation` is given but is not supported.
        """
        if not keyword:
            self._keyword = instructions_util.generate_keywords(
                num_keywords=1)[0]
        else:
            self._keyword = keyword.strip()

        self._frequency = frequency
        if self._frequency is None or self._frequency < 0:
            self._frequency = random.randint(1, _KEYWORD_FREQUENCY)

        if relation is None:
            self._comparison_relation = random.choice(_COMPARISON_RELATION)
        elif relation not in _COMPARISON_RELATION:
            raise ValueError(
                "The supported relation for comparison must be in "
                f"{_COMPARISON_RELATION}, but {relation} is given.")
        else:
            self._comparison_relation = relation

        self._description_pattern = (
            "In your response, the word {keyword} should appear {relation} "
            "{frequency} times.")
        return self._description_pattern.format(
            keyword=self._keyword,
            relation=self._comparison_relation,
            frequency=self._frequency)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"keyword": self._keyword,
                "frequency": self._frequency,
                "relation": self._comparison_relation}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["keyword", "frequency", "relation"]

    def check_following(self, value):
        """Checks whether the keyword appears with the required frequency.

        Args:
          value: A string representing the response.

        Returns:
          The result of comparing the case-insensitive number of occurrences
          of the keyword with the stored frequency; None if the stored
          relation is neither of the supported ones.
        """
        # The keyword is literal text: escape it so that regex
        # metacharacters are counted verbatim rather than interpreted.
        actual_occurrences = len(re.findall(
            re.escape(self._keyword), value, flags=re.IGNORECASE))

        if self._comparison_relation == _COMPARISON_RELATION[0]:
            return actual_occurrences < self._frequency
        elif self._comparison_relation == _COMPARISON_RELATION[1]:
            return actual_occurrences >= self._frequency
class NumberOfWords(Instruction):
    """Checks the number of words in the response."""

    def build_description(self, *, num_words=None, relation=None):
        """Builds the instruction description.

        Args:
          num_words: An integer threshold for the number of words in the
            response. A missing or negative value is replaced by a random
            integer in [`_NUM_WORDS_LOWER_LIMIT`, `_NUM_WORDS_UPPER_LIMIT`].
          relation: One of the strings in `_COMPARISON_RELATION`:
            'less than' requires the word count to be < num_words;
            'at least' requires the word count to be >= num_words.
            A missing value is replaced by a random choice.

        Returns:
          A string representing the instruction description.

        Raises:
          ValueError: If `relation` is given but is not supported.
        """
        threshold = num_words
        if threshold is None or threshold < 0:
            threshold = random.randint(
                _NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT)
        self._num_words = threshold

        if relation is not None and relation not in _COMPARISON_RELATION:
            raise ValueError(
                "The supported relation for comparison must be in "
                f"{_COMPARISON_RELATION}, but {relation} is given.")
        self._comparison_relation = (
            random.choice(_COMPARISON_RELATION) if relation is None
            else relation)

        self._description_pattern = (
            "Answer with {relation} {num_words} words.")
        return self._description_pattern.format(
            relation=self._comparison_relation,
            num_words=self._num_words)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {
            "num_words": self._num_words,
            "relation": self._comparison_relation,
        }

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["num_words", "relation"]

    def check_following(self, value):
        """Checks whether the word count satisfies the stored relation.

        Args:
          value: A string representing the response.

        Returns:
          The result of comparing the count reported by
          `instructions_util.count_words` with the threshold; None if the
          stored relation is neither of the supported ones.
        """
        word_count = instructions_util.count_words(value)
        if self._comparison_relation == _COMPARISON_RELATION[0]:
            return word_count < self._num_words
        if self._comparison_relation == _COMPARISON_RELATION[1]:
            return word_count >= self._num_words
        return None
class JsonFormat(Instruction):
    """Checks that the response is valid JSON."""

    def build_description(self):
        """Builds the instruction description.

        Returns:
          A string representing the instruction description.
        """
        self._description_pattern = (
            "Entire output should be wrapped in JSON format. You can use markdown"
            " ticks such as ```."
        )
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks whether the response parses as JSON.

        Args:
          value: A string representing the response. An optional markdown
            code fence around the payload is removed before parsing.

        Returns:
          True if the remaining text is accepted by `json.loads`; otherwise
          False.
        """
        payload = value.strip()
        # Opening fences are removed in this order; the bare fence goes last
        # so that it also catches an unlabelled code block.
        for opening_fence in ("```json", "```Json", "```JSON", "```"):
            payload = payload.removeprefix(opening_fence)
        payload = payload.removesuffix("```").strip()

        try:
            json.loads(payload)
        except ValueError:
            return False
        else:
            return True
class ParagraphFirstWordCheck(Instruction):
"""Check the paragraph and the first word of the nth paragraph."""
def build_description(self, num_paragraphs = None,
nth_paragraph = None,
first_word = None):
r"""Build the instruction description.
Args:
num_paragraphs: An integer indicating the number of paragraphs expected
in the response. A paragraph is a subset of the string that is
expected to be separated by '\n\n'.
nth_paragraph: An integer indicating the paragraph number that we look at.
Note that n starts from 1.
first_word: A string that represent the first word of the bth paragraph.
Returns:
A string representing the instruction description.
"""
self._num_paragraphs = num_paragraphs
if self._num_paragraphs is None or self._num_paragraphs < 0:
self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
self._nth_paragraph = nth_paragraph
if (
self._nth_paragraph is None
or self._nth_paragraph <= 0
or self._nth_paragraph > self._num_paragraphs
):
self._nth_paragraph = random.randint(1, self._num_paragraphs + 1)
self._first_word = first_word
if self._first_word is None:
self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
self._first_word = self._first_word.lower()
self._description_pattern = (
"There should be {num_paragraphs} paragraphs. " +
"Paragraphs and only paragraphs are separated with each other by two " +
"new lines as if it was '\\n\\n' in python. " +
"Paragraph {nth_paragraph} must start with word {first_word}.")
return self._description_pattern.format(
num_paragraphs=self._num_paragraphs,
nth_paragraph=self._nth_paragraph,
first_word=self._first_word)
def get_instruction_args(self):
"""Returns the keyward args of `build_description`."""
return {"num_paragraphs": self._num_paragraphs,
"nth_paragraph": self._nth_paragraph,
"first_word": self._first_word}
def get_instruction_args_keys(self):
"""Returns the args keys of `build_description`."""
return ["num_paragraphs", "nth_paragraph", "first_word"]
def check_following(self, value):
"""Checks for required number of paragraphs and correct first word.
Args:
value: a string representing the response. The response may contain
paragraphs that are separated by two new lines and the first word of
the nth paragraph will have to match a specified word.
Returns:
True if the number of paragraphs is the same as required and the first
word of the specified paragraph is the same as required. Otherwise, false.
"""
paragraphs = re.split(r"\n\n", value)
num_paragraphs = len(paragraphs)
for paragraph in paragraphs:
if not paragraph.strip():
num_paragraphs -= 1
# check that index doesn't go out of bounds
if self._nth_paragraph <= num_paragraphs:
paragraph = paragraphs[self._nth_paragraph - 1].strip()
if not paragraph:
return False
else:
return False
first_word = ""
punctuation = {".", ",", "?", "!", "'", '"'}
# get first word and remove punctuation
word = paragraph.split()[0].strip()
# TODO(jeffrey): make more complex?
word = word.lstrip("'")
word = word.lstrip('"')