#!/usr/bin/env python

# krill - the hacker's way of keeping up with the world
#
# Copyright (c) 2015 Philipp Emanuel Weidmann <[email protected]>
#
# Nemo vir est qui mundum non reddat meliorem.
#
# Released under the terms of the GNU General Public License, version 3
# (https://gnu.org/licenses/gpl.html)


try:
    # Python 3
    from urllib.request import urlopen
except ImportError:
    # Python 2
    from urllib2 import urlopen

import re
import sys
import codecs
import hashlib
import argparse
from time import sleep, mktime
from datetime import datetime
from collections import namedtuple

import feedparser
from bs4 import BeautifulSoup
from blessings import Terminal


StreamItem = namedtuple("StreamItem", ["source", "time", "text", "link"])

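# Turns raw Twitter HTML or feed XML into StreamItem tuples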
class StreamParser:
    def _html_to_text(self, html):
        # Hack to prevent Beautiful Soup from collapsing space-keeping tags
        # until no whitespace remains at all
        html = re.sub("<(br|p)", " \\g<0>", html, flags=re.IGNORECASE)
        text = BeautifulSoup(html, "html.parser").get_text()
        # Idea from http://stackoverflow.com/a/1546251
        return " ".join(text.strip().split())

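    # Scrapes tweets from the HTML of a Twitter profile or search page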
    def get_tweets(self, html):
        document = BeautifulSoup(html, "html.parser")

        for tweet in document.find_all("p", class_="tweet-text"):
            header = tweet.find_previous("div", class_="stream-item-header")

            name = header.find("strong", class_="fullname").string
            username = header.find("span", class_="username").b.string

            time_string = header.find("span", class_="_timestamp")["data-time"]
            time = datetime.fromtimestamp(int(time_string))

            # For Python 2 and 3 compatibility
            to_unicode = unicode if sys.version_info[0] < 3 else str
            # Remove ellipsis characters added by Twitter
            text = self._html_to_text(to_unicode(tweet).replace(u"\u2026", ""))

            link = "https://twitter.com%s" % header.find("a", class_="tweet-timestamp")["href"]

            yield StreamItem("%s (@%s)" % (name, username), time, text, link)

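    # Yields one StreamItem per entry of an RSS/Atom feed parsed with feedparser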
    def get_feed_items(self, xml):
        feed_data = feedparser.parse(xml)

        for entry in feed_data.entries:
            time = datetime.fromtimestamp(mktime(entry.published_parsed))
            text = "%s - %s" % (entry.title, self._html_to_text(entry.description))
            yield StreamItem(feed_data.feed.title, time, text, entry.link)

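# Produces short excerpts of long texts, centered on a regex match if one is given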
class TextExcerpter:
    # Clips the text to the position succeeding the first whitespace string
    def _clip_left(self, text):
        return re.sub(r"^\S*\s*", "", text, count=1)


    # Clips the text to the position preceding the last whitespace string
    def _clip_right(self, text):
        return re.sub(r"\s*\S*$", "", text, count=1)

    # Returns a portion of text at most max_length in length
    # and containing the first match of pattern, if specified
    def get_excerpt(self, text, pattern=None, max_length=300):
        if len(text) <= max_length:
            return text, False, False

        if pattern is None:
            return self._clip_right(text[0:max_length]), False, True
        else:
            match = pattern.search(text)
            start, end = match.span()
            match_text = match.group()
            remaining_length = max_length - len(match_text)
            if remaining_length <= 0:
                # Matches are never clipped
                return match_text, start > 0, end < len(text)

            excerpt_start = max(start - (remaining_length // 2), 0)
            excerpt_end = min(end + (remaining_length - (start - excerpt_start)), len(text))
            # Adjust start of excerpt in case the string after the match was too short
            excerpt_start = max(excerpt_end - max_length, 0)
            excerpt = text[excerpt_start:excerpt_end]
            if excerpt_start > 0:
                excerpt = self._clip_left(excerpt)
            if excerpt_end < len(text):
                excerpt = self._clip_right(excerpt)

            return excerpt, excerpt_start > 0, excerpt_end < len(text)

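# Ties everything together: loads sources and filters, fetches items and prints them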
class Application:
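    # Hashes of items that have already been shown, kept across updates
    # so the same item is not printed twice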
    _known_hashes = set()


    def __init__(self, args):
        self.args = args


    def _print_error(self, error):
        print("")
        print(Terminal().bright_red(error))

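    # Downloads the given URL and returns an iterable of StreamItems,
    # using the Twitter scraper for twitter.com URLs and feedparser otherwise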
    def _get_stream_items(self, url):
        try:
            data = urlopen(url).read()
        except Exception as error:
            self._print_error("Unable to retrieve data from URL '%s': %s" % (url, str(error)))
            # The problem might be temporary, so we do not exit
            return list()

        parser = StreamParser()
        if "//twitter.com/" in url:
            return parser.get_tweets(data)
        else:
            return parser.get_feed_items(data)

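    # Returns the non-empty, non-comment lines of the given file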
    def _read_file(self, filename):
        try:
            with open(filename, "r") as myfile:
                lines = [line.strip() for line in myfile.readlines()]
        except Exception as error:
            self._print_error("Unable to read file '%s': %s" % (filename, str(error)))
            sys.exit(1)

        # Discard empty lines and comments
        return [line for line in lines if line and not line.startswith("#")]

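    # Pretty-prints a single item, highlighting hashtags, mentions, URLs
    # and the matched filter pattern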
    def _print_stream_item(self, item, pattern=None):
        print("")

        term = Terminal()
        time_label = "%s at %s" % (term.yellow(item.time.strftime("%a, %d %b %Y")),
                                   term.yellow(item.time.strftime("%H:%M")))
        print("%s on %s:" % (term.bright_cyan(item.source), time_label))

        excerpter = TextExcerpter()
        excerpt, clipped_left, clipped_right = excerpter.get_excerpt(item.text, pattern)

        # Hashtag or mention
        excerpt = re.sub(r"(?<!\w)([#@])(\w+)",
                         term.green("\\g<1>") + term.bright_green("\\g<2>") + term.bright_white,
                         excerpt)
        # URL in one of the forms commonly encountered on the web
        excerpt = re.sub(r"(\w+://)?[\w.-]+\.[a-zA-Z]{2,4}(?(1)|/)[\w#?&=%/:.-]*",
                         term.bright_magenta_underline("\\g<0>") + term.bright_white,
                         excerpt)

        if pattern is not None:
            # TODO: This can break previously applied highlighting (e.g. URLs)
            excerpt = pattern.sub(term.black_on_bright_yellow("\\g<0>") + term.bright_white,
                                  excerpt)

        print("  %s%s%s" % ("... " if clipped_left else "",
                            term.bright_white(excerpt),
                            " ..." if clipped_right else ""))
        print("  %s" % term.bright_blue_underline(item.link))

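    # Performs a single update cycle: reloads sources and filters,
    # fetches all items, drops duplicates and prints matches in chronological order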
    def update(self):
        # Reload sources and filters to allow for live editing
        sources = list()
        if self.args.sources is not None:
            sources.extend(self.args.sources)
        if self.args.sources_file is not None:
            sources.extend(self._read_file(self.args.sources_file))
        if not sources:
            self._print_error("No source specifications found")
            sys.exit(1)

        filters = list()
        if self.args.filters is not None:
            filters.extend(self.args.filters)
        if self.args.filters_file is not None:
            filters.extend(self._read_file(self.args.filters_file))

        patterns = list()
        for filter_string in filters:
            try:
                patterns.append(re.compile(filter_string, re.IGNORECASE))
            except Exception as error:
                self._print_error("Error while compiling regular expression '%s': %s" %
                                  (filter_string, str(error)))
                sys.exit(1)

        items = list()
        def add_item(item, pattern=None):
            # Note that item.time is excluded from duplicate detection
            # as it sometimes changes without affecting the content
            hash_code = hashlib.md5((item.source + item.text + item.link)
                                    .encode("utf-8")).hexdigest()
            if hash_code in self._known_hashes:
                # Do not print an item more than once
                return
            self._known_hashes.add(hash_code)
            items.append((item, pattern))

        for source in sources:
            for item in self._get_stream_items(source):
                if patterns:
                    for pattern in patterns:
                        if pattern.search(item.text):
                            add_item(item, pattern)
                            break
                else:
                    # No filter patterns specified; simply print all items
                    add_item(item)

        # Print latest news last
        items.sort(key=lambda item: item[0].time)

        for item in items:
            self._print_stream_item(item[0], item[1])

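    # Prints the banner, then updates repeatedly at the configured interval
    # (or only once if the interval is zero or negative)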
    def run(self):
        term = Terminal()
        print("%s (%s)" % (term.bold("krill 0.1.0"),
                           term.underline("https://github.com/p-e-w/krill")))

        while True:
            try:
                self.update()
                if self.args.update_interval <= 0:
                    break
                sleep(self.args.update_interval)
            except KeyboardInterrupt:
                # Do not print stacktrace if user exits with Ctrl+C
                sys.exit()


def main():
    # Force UTF-8 encoding for stdout as we will be printing Unicode characters
    # which will fail with a UnicodeEncodeError if the encoding is not set,
    # e.g. because stdout is being piped.
    # See http://www.macfreek.nl/memory/Encoding_of_Python_stdout and
    # http://stackoverflow.com/a/4546129 for extensive discussions of the issue.
    if sys.stdout.encoding != "UTF-8":
        # For Python 2 and 3 compatibility
        prev_stdout = sys.stdout if sys.version_info[0] < 3 else sys.stdout.buffer
        sys.stdout = codecs.getwriter("utf-8")(prev_stdout)

    arg_parser = argparse.ArgumentParser(prog="krill", description="Read and filter web feeds.")
    arg_parser.add_argument("-s", "--sources", nargs="+",
                            help="URLs to pull data from", metavar="URL")
    arg_parser.add_argument("-S", "--sources-file",
                            help="file from which to load source URLs", metavar="FILE")
    arg_parser.add_argument("-f", "--filters", nargs="+",
                            help="patterns used to select feed items to print", metavar="REGEX")
    arg_parser.add_argument("-F", "--filters-file",
                            help="file from which to load filter patterns", metavar="FILE")
    arg_parser.add_argument("-u", "--update-interval", default=300, type=int,
                            help="time between successive feed updates " +
                                 "(default: 300 seconds, 0 for single pull only)", metavar="SECONDS")
    args = arg_parser.parse_args()

    if args.sources is None and args.sources_file is None:
        arg_parser.error("either a source URL (-s) or a sources file (-S) must be given")

    Application(args).run()


if __name__ == "__main__":
    main()