preprocessing.py
import re


def untokenize(text):
    """
    Untokenizing a text undoes the tokenizing operation, restoring
    punctuation and spaces to the places that people expect them to be.
    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    Credit: https://stackoverflow.com/questions/21948019/python-untokenize-a-sentence
    """
    text = text.strip()
    text = text.replace("-lrb-", "(")
    text = text.replace("-rrb-", ")")
    text = text.replace("`` ", '" ').replace(" ''", ' "').replace(". . .", "...")
    text = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", text)
    text = re.sub(r" ([.,:;?!%]+)$", r"\1", text)
    return text.strip()
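# Example of the intended behaviour: spaces before punctuation are collapsed and the
# PTB bracket tokens "-lrb-"/"-rrb-" are restored, e.g.
#   untokenize("john smith -lrb- born 1970 -rrb- likes cake , too .")
#   -> "john smith ( born 1970 ) likes cake, too."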
def e2e_preprocess_function(examples, data_args, tokenizer):
    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    inputs = examples["meaning_representation"]
    targets = examples["human_reference"]
    # Linearize the E2E meaning representation, e.g.
    # "name[Alimentum], area[city centre]" -> "name is Alimentum, area is city centre".
    inputs = [text.replace("[", " is ").replace("]", "") for text in inputs]
    inputs = [prefix + inp for inp in inputs]
    model_inputs = tokenizer(
        inputs,
        max_length=data_args.max_source_length,
        padding=padding,
        truncation=True,
    )

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, padding=padding, truncation=True
        )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # when we want to ignore padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
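# Usage sketch (hypothetical wiring, not taken from this repository): the function is
# written to be mapped over a Hugging Face dataset that has "meaning_representation"
# and "human_reference" columns. The dataset name, checkpoint, and `data_args` object
# below are illustrative assumptions.
#
#   from functools import partial
#   from datasets import load_dataset
#   from transformers import AutoTokenizer
#
#   raw_datasets = load_dataset("e2e_nlg")
#   tokenizer = AutoTokenizer.from_pretrained("t5-small")
#   tokenized = raw_datasets["train"].map(
#       partial(e2e_preprocess_function, data_args=data_args, tokenizer=tokenizer),
#       batched=True,
#       remove_columns=raw_datasets["train"].column_names,
#   )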
def synthbio_preprocess_function(examples, data_args, tokenizer):
    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    raw_inputs = examples["input_text"]
    raw_targets = examples["target_text"]

    # Linearize each SynthBio infobox into a flat "field is value, ..." string,
    # skipping non-textual fields and stripping trailing punctuation from values.
    inputs = []
    for input_text in raw_inputs:
        input_data = ""
        for field_name, content in zip(
            input_text["table"]["column_header"], input_text["table"]["content"],
        ):
            if field_name in ["image", "website", "caption", "source"]:
                continue
            if input_data != "":
                input_data += ", "
            if len(content) > 0:
                if content[-1] in [",", "."]:
                    content = content[:-1]
            content = untokenize(content)
            input_data += f"{field_name} is {content}"
        inputs.append(input_data.lower())

    model_inputs = tokenizer(
        inputs,
        max_length=data_args.max_source_length,
        padding=padding,
        truncation=True,
    )

    # Use only the first line of each reference text as the target.
    targets = []
    for target_text in raw_targets:
        sen = target_text.split("\n")[0]
        targets.append(untokenize(sen).lower())

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, padding=padding, truncation=True
        )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # when we want to ignore padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
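if __name__ == "__main__":
    # Minimal self-check (illustrative only, not part of the training pipeline):
    # exercise untokenize on a SynthBio-style tokenized sentence. The two preprocess
    # functions above are expected to be applied via dataset.map(...) in the training
    # script, as sketched after e2e_preprocess_function.
    example = "jane doe -lrb- born 1970 -rrb- is a novelist , poet and editor ."
    print(untokenize(example))
    # expected output: jane doe ( born 1970 ) is a novelist, poet and editor.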