forked from DavidJacobson/SafeText
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsafetext.py
62 lines (52 loc) · 3.11 KB
/
safetext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# SafeText
# This script takes a text file as input and removes any characters that could serve as unique identifiers and
# so reveal an otherwise confidential source.
# Inspiration https://www.zachaysan.com/writing/2017-12-30-zero-width-characters
# David Jacobson
import argparse
from characters_safetext import HOMOGLYPHS, ZERO_WIDTH_CHARS
def underline(chars: str) -> str:
    """Return *chars* wrapped in ANSI escape codes so a terminal underlines it.

    The text is prefixed with the "underline on" sequence (``ESC[4m``) and
    followed by the "reset all attributes" sequence (``ESC[0m``), so the
    underline does not leak into whatever is printed afterwards.
    """
    underline_on = '\033[4m'
    reset = '\033[0m'
    return underline_on + chars + reset
# These are words that could fingerprint an author's location
# Information taken from https://en.oxforddictionaries.com/spelling/british-and-spelling
COUNTRY_SMELLS = (  # Expand this as well
    # Order is significant only in that it is the order warnings are printed in.
    # -re endings
    "centre", "fibre", "litre", "theatre",
    # -our endings
    "colour", "flavour", "humour", "labour", "neighbour",
    # -ise endings
    "apologise", "organise", "recognise",
    # -yse endings
    "analyse", "breathalyse", "paralyse",
    # doubled "l"
    "travelled", "travelling", "traveller",
    # "ae" / "oe" digraphs
    "paediatric", "oestrogen", "manoeuvre", "leukaemia",
    # -ence endings
    "defence", "licence", "offence", "pretence",
    # -ogue endings
    "analogue", "catalogue", "dialogue",
    # assorted other spellings
    "grey", "tonne", "honour", "cancelled", "jewellery", "mould", "cheque", "pyjamas",
)
# Command line: a single positional argument naming the file to clean.
parser = argparse.ArgumentParser(description="Clean a text file of any identifying Unicode characters")
parser.add_argument('input', metavar='I', help='File to be cleaned')
args = parser.parse_args()

# The cleaned copy is written to the input path with ".safe" appended; the input itself is not modified.
out_file_name = args.input + ".safe"
print("[*] Cleaning {} to {} ...".format(args.input, out_file_name))

with open(args.input, mode="r", encoding="UTF-8") as in_file:  # File to process
    lines = in_file.readlines()  # Read the lines into memory

cleaned_lines = []
for line_number, line in enumerate(lines, start=1):  # Count from 1 so reported line numbers are human readable
    line_to_display = line  # Copy shown to the user, with the hidden characters made visible
    display_line = False

    # Zero-width characters: shown as "*" in the report, removed from the output.
    for name, zero_width_char in ZERO_WIDTH_CHARS.items():
        if zero_width_char in line:
            display_line = True
            print("[!] FOUND a {} ON LINE # {}".format(name, line_number))
            line_to_display = line_to_display.replace(zero_width_char, "*")
            line = line.replace(zero_width_char, "")  # To actually remove

    # Homoglyphs: underlined in the report, replaced in the output by the last character of their HOMOGLYPHS key.
    for name, homoglyph in HOMOGLYPHS.items():
        if homoglyph in line:
            display_line = True
            print("[!] FOUND HOMOGLYPHIC CHARACTER {} ON LINE {}".format(name, line_number))
            line_to_display = line_to_display.replace(homoglyph, underline(homoglyph))
            replacement_char = name[-1]
            line = line.replace(homoglyph, replacement_char)

    if display_line:  # If the line had to be modified, print it.
        print(line_to_display.strip())

    # Country-specific spellings are only reported; the text is left as written.
    lowered_line = line.lower()  # Normalize once per line rather than once per word
    for word in COUNTRY_SMELLS:
        if word in lowered_line:  # Substring match, so longer words containing the spelling are flagged too
            print("[!] WARNING - Use of spelling ({}) that identifies country on line {}".format(word, line_number))

    cleaned_lines.append(line)

# The context manager closes the output file even if a write fails part-way through.
with open(out_file_name, mode="w", encoding="UTF-8") as out_file:
    out_file.writelines(cleaned_lines)
print("[*] Output file closed")