ciphertext-decryption.py — 191 lines (157 loc) · 7.57 KB
import string
import textwrap
import numpy as np
import random as rand
import collections as col
from sklearn.svm import SVC
from collections import defaultdict as dd
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
def graph(features, id, ax):
    """Scatter-plot one feature column on *ax*, one point per letter.

    Parameters
    ----------
    features : list of per-letter rows; ``row[id]`` is the value of
        feature number *id* for that letter.
    id : index of the feature column to plot (name kept for caller
        compatibility even though it shadows the builtin ``id``).
    ax : matplotlib axes to draw on; returned for chaining.
    """
    values = np.array([row[id] for row in features])
    n = len(values)  # one point (and one colour) per letter, normally 26
    # BUG FIX: plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed
    # in 3.9; plt.get_cmap is the compatible replacement.  The 26-name colour
    # list was dead code (only its length was used), and a fixed 26-entry
    # colour array made scatter() fail whenever fewer than 26 letters remain
    # on later iterations — colours are now sized to the actual point count.
    cmap = plt.get_cmap('hsv', n)
    ax.scatter([chr(i + 97) for i in range(n)], values, c=cmap(np.arange(n)))
    return ax
def extract_feature_FR(alphabet, letters, NL, accuracy):
    """Frequency feature: relative frequency of each alphabet letter.

    Parameters
    ----------
    alphabet : letters to count (dict keys of the result).
    letters : all letters of the current chunk.
    NL : total number of letters in the chunk (denominator).
    accuracy : decimal places for rounding.

    Returns a dict mapping each letter of *alphabet* to its share of *NL*.
    """
    feature_FR = dict.fromkeys(alphabet, 0)
    for letter in letters:
        if letter in alphabet:
            feature_FR[letter] += 1 / NL
    # BUG FIX: the original called round_dict but discarded its return value;
    # with a non-mutating round_dict the rounding was silently never applied.
    # Returning its result makes the rounding effective either way.
    return round_dict(feature_FR, accuracy)
def extract_feature_WL(alphabet, words, word_length, letters_times, accuracy):
    """Word-length feature: how often each letter occurs in words of a
    given length bucket, normalised by the letter's total count.

    Buckets: exact match for lengths 1-4; 5-7 and 8-10 are grouped ranges;
    anything above 10 matches all words at least that long.  For
    word_length == 1 the feature is a binary indicator (word IS the letter).

    Parameters
    ----------
    alphabet : letters of interest.
    words : words of the current chunk.
    word_length : requested bucket selector.
    letters_times : per-letter occurrence counts for the chunk.
    accuracy : decimal places for rounding.
    """
    feature_WL = dict.fromkeys(alphabet, 0)
    for w in words:
        if word_length == 1 and len(w) == word_length and w in alphabet:
            # single-letter words (e.g. "a", "i") act as a binary flag
            feature_WL[w] = 1
        elif (has_domain(word_length, 2, 4) and len(w) == word_length) or \
             (has_domain(word_length, 5, 7) and has_domain(len(w), 5, 7)) or \
             (has_domain(word_length, 8, 10) and has_domain(len(w), 8, 10)) or \
             (word_length > 10 and len(w) >= word_length):
            for l in w:
                if l in alphabet:
                    feature_WL[l] += 1 / letters_times.get(l)
    # BUG FIX: the original called round_dict but discarded its return value,
    # so the rounding was never applied; return its result explicitly.
    return round_dict(feature_WL, accuracy)
def extract_feature_LP(alphabet, case, words, letters_times, accuracy):
    """Letter-position feature, normalised by each letter's total count.

    *case* selects the variant: a string containing "first" counts
    word-initial letters; one containing "last" counts word-final letters;
    any other value (the caller passes "both") counts words whose first and
    last letters coincide.

    Parameters
    ----------
    alphabet : letters of interest.
    case : variant selector string ("first" / "last" / "both").
    words : words of the current chunk.
    letters_times : per-letter occurrence counts for the chunk.
    accuracy : decimal places for rounding.
    """
    feature_LP = dict.fromkeys(alphabet, 0)
    for w in words:
        first, last = w[0], w[-1]
        if "first" in case and first in alphabet:
            feature_LP[first] += 1 / letters_times.get(first)
        elif "last" in case and last in alphabet:
            feature_LP[last] += 1 / letters_times.get(last)
        elif first == last and first in alphabet:
            feature_LP[first] += 1 / letters_times.get(first)
    # BUG FIX: the original called round_dict but discarded its return value,
    # so the rounding was never applied; return its result explicitly.
    return round_dict(feature_LP, accuracy)
def extract_feature_DL(alphabet, words, letters_times, accuracy):
    """Doubled-letter feature: how often each letter appears immediately
    repeated inside a word (e.g. the second 'l' in "hello"), normalised
    by the letter's total count.

    Parameters
    ----------
    alphabet : letters of interest.
    words : words of the current chunk.
    letters_times : per-letter occurrence counts for the chunk.
    accuracy : decimal places for rounding.
    """
    feature_DL = dict.fromkeys(alphabet, 0)
    for w in words:
        prev_letter = "#"  # sentinel that can never equal a real letter
        for l in w:
            if len(w) != 1 and prev_letter == l and l in alphabet:
                feature_DL[l] += 1 / letters_times.get(l)
            prev_letter = l
    # BUG FIX: the original called round_dict but discarded its return value,
    # so the rounding was never applied; return its result explicitly.
    return round_dict(feature_DL, accuracy)
def has_domain(var, point1, point2):
    """Return True if *var* lies in the closed interval [point1, point2]."""
    # `True if cond else False` was redundant — a chained comparison is
    # already a bool.
    return point1 <= var <= point2
def round_dict(dict, accuracy):
    """Round every value of *dict* to *accuracy* decimal places, in place.

    BUG FIX: the original built and returned a brand-new dict while every
    caller in this file ignored the return value and relied on in-place
    mutation, so the rounding silently never took effect.  The dict is now
    mutated in place and also returned, so both calling styles work.

    NOTE: the parameter name shadows the builtin ``dict``; it is kept
    unchanged for caller compatibility.
    """
    for key in dict:
        dict[key] = round(dict[key], accuracy)
    return dict
def get_letters(words):
    """Flatten an iterable of words (or a single string) into a flat list
    of individual characters."""
    letters = []
    for word in words:
        letters.extend(word)
    return letters
def divide_chunks(list, n):
    """Yield successive slices of *list*, each of at most *n* elements.

    NOTE: the parameter name shadows the builtin ``list``; it is kept
    unchanged for caller compatibility.
    """
    total = len(list)
    start = 0
    while start < total:
        yield list[start:start + n]
        start += n
def is_shaffled_alphabet(key):
    """Return True when *key* contains no repeated elements, i.e. the
    predicted mapping is a valid permutation (function name kept as-is
    for caller compatibility)."""
    distinct = set(key)
    return len(distinct) == len(key)
def update_y(fy, y):
    """Fill the still-unresolved ("NN") slots of *fy* with predictions.

    Walks *fy*; each "NN" slot consumes the next prediction from *y*, but
    the slot is only overwritten when that prediction is not already
    present somewhere in *fy* (duplicates stay "NN" for a later pass).
    Mutates *fy* in place and returns it.
    """
    cursor = 0
    for idx, current in enumerate(fy):
        if current == "NN":
            candidate = y[cursor]
            if candidate not in fy:
                fy[idx] = candidate
            cursor += 1
    return fy
def update_alphabet(alphabet, fy):
    """Return the letters of *alphabet* not yet assigned anywhere in *fy*,
    preserving their original order."""
    assigned = set(fy)
    return [letter for letter in alphabet if letter not in assigned]
def load_local_data():
    """Return (training filename, testing filename, decryption alphabet).

    The decryption alphabet corresponds to the encryption alphabet
    "rcheyobdtmgiskuqlapfzjxnvw" noted by the original author.
    """
    return (
        "TRAINING-tolstoy-anna-karenina.txt",
        "TESTING-pushkin-eugene-onegin.txt",
        "rgbhdtkclvnqjxfspamioyzweu",
    )
def decrypt(text, fy, alphabet):
    """Decrypt the file *text* by substituting each cipher letter
    (alphabet[i]) with its predicted plain letter (fy[i]).

    Characters outside the mapping pass through unchanged.  The wrapped
    plaintext is written to "output.txt" and printed to stdout.
    """
    decr_dict = {cipher: fy[i] for i, cipher in enumerate(alphabet)}
    with open(text, 'r') as f:
        encr_text = f.read()
    decr_text = "".join(decr_dict.get(ch, ch) for ch in get_letters(encr_text))
    wrapped_text = textwrap.fill(decr_text, 140)
    with open("output.txt", 'w') as output:
        output.write(wrapped_text)
    print('\033[1m' + "Decrypted Ciphertext:\n" + '\033[0m' + wrapped_text)
def process(super_words, alphabet, chunks):
    """Build the feature matrix X and label vector y from a word list.

    *super_words* is split into consecutive chunks of *chunks* words; for
    every chunk, 12 per-letter features (f0..f11) are extracted and each
    letter of *alphabet* contributes one 12-dimensional sample labelled
    with itself.  Every feature column is also scatter-plotted on a 4x3
    subplot grid, which is shown before returning.

    NOTE(review): two of the word-length features use rand.randint, so the
    extracted features are not deterministic across runs.
    """
    accuracy = 10  # decimal places handed down to every feature extractor
    fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(12, 8))
    features, labels = [], []
    sub_words = list(divide_chunks(super_words, chunks))
    for words in sub_words:
        letters = get_letters(words)
        NL = len(letters)  # total number of letters in this chunk
        # per-letter occurrence counts for this chunk, keys in sorted order
        letters_times = dict(sorted({key: value for key, value in dict(col.Counter(letters)).items()}.items()))
        f0 = extract_feature_FR(alphabet, letters, NL, accuracy)
        # word-length features: exact lengths 1-4, one random length from 5-7
        # and from 8-10, and the ">10" bucket (11)
        f1, f2, f3, f4, f5, f6, f7 = [extract_feature_WL(alphabet, words, i, letters_times, accuracy) for i in (1, 2, 3, 4, rand.randint(5,7), rand.randint(8,10), 11)]
        # letter-position features: word-initial, word-final, first==last
        f8, f9, f10 = (extract_feature_LP(alphabet, pos, words, letters_times, accuracy) for pos in ("first", "last", "both"))
        f11 = extract_feature_DL(alphabet, words, letters_times, accuracy)
        temp_features = dd(list) # defaultdict(list): letter -> its 12 feature values
        for d in (f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11):
            for key, value in d.items():
                temp_features[key].append(value)
        temp_features = dict(temp_features) # plain-dict view of the defaultdict
        temp_features = list(temp_features.values()) # one 12-value row per letter
        for i, ax in enumerate(axs.flatten()):
            ax = graph(temp_features, i, ax)
            ax.set_title(f'Feature {i}')
        features.extend(temp_features)
        labels.extend(alphabet)
    plt.tight_layout() # adjusts the spacing between subplots to improve readability
    plt.show()
    X, y = np.array(features), np.array(labels)
    return X, y
def main():
    """Train an SVC on training-text letter features, iteratively predict
    the cipher-to-plain letter mapping on the test text, score it, and
    decrypt the test file.

    Each pass trains on the remaining alphabet and predicts one candidate
    mapping; letters whose prediction is already taken stay "NN" and are
    retried with a reduced alphabet until the prediction is a valid
    permutation (or only a single unresolved letter remains).
    """
    training_text, testing_text, decryption_alphabet = load_local_data()
    done = False
    alphabet = list(string.ascii_lowercase)
    final_y = ["NN" for a in range(len(alphabet))]  # "NN" marks still-unresolved letters
    np.set_printoptions(suppress=True) # to avoid scientific notation when printing
    with open(training_text, 'r') as TR_f:
        TR_words = TR_f.read().split()
    with open(testing_text, 'r') as TE_f:
        TE_words = TE_f.read().split()
    while not done:
        print('\033[1m' + "\nTraining Process: " + '\033[0m' + "a,b,c,...,z refer to the real-letters of the English alphabet.")
        X_train, y_train = process(TR_words, alphabet, 400)
        svc = SVC()
        svc.fit(X_train, y_train)
        print('\033[1m' + "\nTesting Process: " + '\033[0m' + "a,b,c,...,z refer to cipher-letters!")
        # chunk size == number of test words -> the whole test text is one chunk
        X_test = process(TE_words, alphabet, len(list(TE_words)))[0]
        y_pred = svc.predict(X_test)
        final_y = update_y(final_y, y_pred)
        y_test = list(decryption_alphabet)  # ground-truth mapping used for scoring
        if is_shaffled_alphabet(y_pred):
            # prediction already forms a permutation -> mapping fully resolved
            done = True
        else:
            # retry with only the letters that are still unassigned
            alphabet = update_alphabet(alphabet, final_y)
            if len(alphabet) == 1:
                final_y = update_y(final_y, alphabet) # no prediction needed!
                done = True
    accuracy = accuracy_score(y_test, final_y)
    print('\033[1m' + "\nAccuracy Classification Score: " + '\033[0m' + "{:.2f}".format(accuracy))
    complete_alphabet = list(string.ascii_lowercase)
    decrypt(testing_text, final_y, complete_alphabet)
# Script entry point: run the full train/predict/decrypt pipeline.
if __name__ == "__main__":
    main()