-
Notifications
You must be signed in to change notification settings - Fork 61
/
sse_validate_utf16le_proof.py
137 lines (101 loc) · 3.46 KB
/
sse_validate_utf16le_proof.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Note: Validation is done for 8-word input, we just need to check 3^8 = 6561 cases
# Validation for 16-word inputs reqires 3**16 = 43'046'721 checks
ELEMENTS_COUNT = 8
ALL_MASK = (1 << ELEMENTS_COUNT) - 1
ALL_BUT_ONE_MASK = (ALL_MASK >> 1)
# 'V' - single-word character (always valid)
# 'L' - low surrogate (must be followed by the high surrogate)
# 'H' - high surrogate
def all_sequences():
index = ['V'] * ELEMENTS_COUNT
def increment():
nonlocal index
for i in range(ELEMENTS_COUNT):
if index[i] == 'V':
index[i] = 'L'
return False
if index[i] == 'L':
index[i] = 'H'
return False
if index[i] == 'H':
index[i] = 'V' # wrap around
pass
return True
overflow = False
while not overflow:
yield index
overflow = increment()
def find_error_in_words(words):
prev = None
if words[0] == 'H':
# We assume that our vector algorithm loads proper data into vectors.
# In the case low surrogate was the last item in the previous iteration.
return 'high surrogate must not start a chunk'
for i, kind in enumerate(words):
if kind == 'V':
if prev == 'L':
return f'low surrogate {i - 1} must be followed by high surrogate'
elif kind == 'L':
if prev == 'L':
return f'low surrogate {i - 1} must be followed by high surrogate'
elif kind == 'H':
if prev != 'L':
return f'high surrogate {i} must be preceded by low surrogate'
prev = kind
return ''
def bitmask(words, state):
result = 0
for bit, type in enumerate(words):
if type == state:
# In SSE vector algorithm we compare 2 x 16 higher bytes of input
# words, which yields a 16-bit mask.
result |= 1 << bit
return result
def mask(words):
L = bitmask(words, 'L')
H = bitmask(words, 'H')
V = (~(L | H)) & ALL_MASK
a = L & (H >> 1)
b = a << 1
c = V | a | b
return c
def dump():
for words in all_sequences():
c = mask(words)
words_image = "[ %s ]" % ' | '.join(words)
error = find_error_in_words(words)
if error == '':
valid_image = 'T'
else:
valid_image = ' '
print(words_image, valid_image, '{:016b} {:04x}'.format(c, c))
def proof():
case1_hit = False
case2_hit = False
for words in all_sequences():
c = mask(words)
if c == ALL_MASK:
case1_hit = True
# all 16 words are valid (either 'V' or pairs 'L', 'H')
assert find_error_in_words(words) == '', (words, find_error_in_words(words))
if c == ALL_BUT_ONE_MASK:
case2_hit = True
# all 15 words are valid (either 'V' or pairs 'L', 'H')
# the last words is either 'L' or 'H' (the word will be
# re-examined in the next iteration of an algorithm)
if words[-1] == 'H':
assert find_error_in_words(words) == 'high surrogate 7 must be preceded by low surrogate'
elif words[-1] == 'L':
assert find_error_in_words(words) == ''
else:
assert False
assert case1_hit
assert case2_hit
print("All OK")
def main():
if 0:
dump()
else:
proof()
if __name__ == '__main__':
main()