Skip to content

Commit

Permalink
Updated IP addresses and boolean values parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
lcferrum committed Jul 31, 2018
1 parent 6bf7812 commit 12d6621
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 10 deletions.
9 changes: 5 additions & 4 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,11 @@ current directory under filename 'hosts'.

By default, after script finishes processing hosts file, it deletes it. To
control this behavior 'Keep' variable is used - if it translates to True
(values '1', 'yes', 'true' and 'on'), file won't be deleted after being
processed. If it translates to False (values '0', 'no', 'false' and 'off') or
variable is omitted - default action takes place and file becomes deleted. In
this example, hosts file won't be deleted after being processed.
(values 'yes', 'true', 'on' and non-zero decimals), the file won't be deleted
after being processed. If it translates to False (values 'no', 'false', 'off'
and zero equivalents) or the variable is omitted, the default action takes
place and the file is deleted. In this example, the hosts file won't be deleted
after being processed.

Hosts files can have wide variety of encodings. So, when reading hosts file,
script uses encoding specified in Content-Type header or in current locale, if
Expand Down
20 changes: 14 additions & 6 deletions adhosts2privoxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
import ConfigParser

# Keep in mind, if you use non-ASCII characters in defaults, you should change their object type from str ("") to unicode (u"") one
# Except user_agent - this shoud stay str object ("")
# And dont't forget to specify correct source file encoding
# Except user_agent - this should stay str object ("")
# And don't forget to specify correct source file encoding

# When current locale encoding can't be determined, fallback_encoding is used
# So it's actually better to set proper POSIX locale environment variables, instead of changing this default
Expand Down Expand Up @@ -49,7 +49,7 @@ def ProcessHostsFile(domain_tree, section, url, file, keep, encoding):
hosts_encoding = encoding

if url:
SafePrint(u"Dowloading {}...".format(section))
SafePrint(u"Downloading {}...".format(section))

# Reasons behind converting back and forth to UTF-8:
# urllib2.quote and urllib.quote_plus choke on non-ASCII characters in unicode objects (any kind of str objects are ok)
Expand Down Expand Up @@ -104,7 +104,7 @@ def ProcessHostsFile(domain_tree, section, url, file, keep, encoding):
for line in hosts.readlines():
if not re.match(white_pattern, line):
line_match = block_pattern.match(line)
line_items = line_match and line_match.group(2) is not None and line_match.group(2).strip().split()
line_items = line_match and line_match.group(4) is not None and line_match.group(4).strip().split()
if line_items:
prc_count += 1
for alias, hostname in enumerate(line_items):
Expand Down Expand Up @@ -133,6 +133,12 @@ def ProcessHostsFile(domain_tree, section, url, file, keep, encoding):

if not keep: os.remove(hosts_path)

def GetConfigBoolean(config, section, option):
    """Read a config option as a boolean, also accepting numeric values.

    The option is first parsed as a float: zero gives False, any other
    number gives True. If the value is not a number, the parser's own
    getboolean() decides the result (and raises ValueError when the value
    is not a recognized boolean word either).
    """
    try:
        numeric_value = config.getfloat(section, option)
    except ValueError:
        # Not a number - fall back to the textual boolean forms
        return config.getboolean(section, option)
    return numeric_value != 0

def GetTimestamp(dt):
    """Format dt as '<Mon> <day> <HH:MM:SS> <YYYY>'.

    The month abbreviation is taken from rfc3164_months instead of the
    locale-dependent %b, and the day is right-aligned in a two-character,
    space-padded field.
    """
    month_abbr = rfc3164_months[dt.month - 1]
    return "{0} {1.day: >2} {1:%H:%M:%S %Y}".format(month_abbr, dt)

Expand All @@ -145,7 +151,9 @@ def GetTimestamp(dt):
SafePrint(u"Licensed under BSD 2-Clause License");
exit(1)

block_pattern = re.compile("^\s*(0\.0\.0\.0|127\.\d{1,3}\.\d{1,3}\.\d{1,3}|::1|::)\s+([\w\s.-]+)#?", re.UNICODE)
# Block pattern conforms (in a sane way) to RFC 4291 and ID draft-main-ipaddr-text-rep-02
# After optional leading whitespace it accepts one of the following addresses:
#   - all-zero IPv4 with one or more zeros per octet (e.g. 0.0.0.0 or 00.000.0.0)
#   - 127.x.x.x where every x is one or more digits (the numeric range of x is not checked)
#   - IPv6 built from one to seven groups of up to four zeros, each followed by ':', and a final group that is
#     empty, up to four zeros, or up to three zeros followed by 1 (e.g. ::, ::1, 0:0:0:0:0:0:0:1)
# Group 4 captures the text after the address (word characters, whitespace, dots and hyphens), i.e. the hostnames;
# an optional '#' may follow it

block_pattern = re.compile("^\s*(0+\.0+\.0+\.0+|127\.\d+\.\d+\.\d+|(0{0,4}:){1,7}(0{0,4}|0{0,3}1))\s+([\w\s.-]+)#?", re.UNICODE)
# Matches lines that hold only a '#' comment or only whitespace (including empty lines)
white_pattern = re.compile("^\s*#.*$|^\s*$")
# Group 1 is the text before the first apostrophe, group 2 is the text after the second one
# NOTE(review): presumably meant for RFC 5987-style charset'language'value strings - confirm at the use site
encoding_pattern = re.compile("^([^']+)'[\w-]*'(.+)")
# English month abbreviations, indexed by month number minus one (see GetTimestamp)
rfc3164_months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
Expand Down Expand Up @@ -177,7 +185,7 @@ def GetTimestamp(dt):
for section in config.sections():
action_file.write(u"# {}".format(config.get(section, "Url") or config.get(section, "File")) + os.linesep)
try:
ProcessHostsFile(domain_tree, section, config.get(section, "Url"), config.get(section, "File"), config.getboolean(section, "Keep"), config.get(section, "Encoding"))
ProcessHostsFile(domain_tree, section, config.get(section, "Url"), config.get(section, "File"), GetConfigBoolean(config, section, "Keep"), config.get(section, "Encoding"))
except UnicodeError as e:
SafePrint(u"Codec error ({}): {}".format(e.encoding, e.message or e.reason))
except urllib2.HTTPError as e:
Expand Down

0 comments on commit 12d6621

Please sign in to comment.