| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | # Written by Adrianna Pinska |
|---|
| 4 | # Licence: GPLv3 |
|---|
| 5 | |
|---|
| 6 | import sys |
|---|
| 7 | import re |
|---|
| 8 | import urllib2 |
|---|
| 9 | import xml.dom.minidom |
|---|
| 10 | import os.path |
|---|
| 11 | import pickle |
|---|
| 12 | import time |
|---|
| 13 | |
|---|
# Posts cross-posted to more than this many newsgroups are dropped.
crosspost_limit = 3
# Pickle file (relative to the working directory) that remembers blacklisted
# subjects between runs.
subject_filename = ".usenet_filter_subject_cache"
# Browser-like User-Agent header sent when fetching the source of a post.
useragent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.16) Gecko/20080702 (Debian-1.8.1.16+nobinonly-0ubuntu1) Galeon/2.0.4 (Ubuntu 2.0.4-1ubuntu1)'

# Add whatever you don't want to see to these lists.  Every entry is a
# regular-expression fragment (hence the escaped dots) and is matched
# case-insensitively anywhere in the text being checked.
badgroups = [
    "celebrities", "gossip", "acting", r"rec\.arts\.tv", "religion",
    r"talk\.origins", r"sci\.meow", "politics", "autism", "depression",
    "economics", r"alt\.community", "kooks", "psychology", "revisionism",
    "comedy", "jewish",
]
badauthors = ["Day Brown", "Colonel Jake"]
badkeywords = ["libertarian", "WTC", "truther", "9/11"]


def _alternation(fragments):
    """Return a regex source that matches any of *fragments*.

    Empty fragments are ignored.  When nothing is left the result is a
    pattern that can never match: joining an empty list would give "()",
    which matches every string and would filter out every single post.
    """
    usable = [fragment for fragment in fragments if fragment]
    if not usable:
        return "(?!)"
    return "(%s)" % "|".join(usable)


# name -> (regex source, flags).  The first three are the user filters above;
# the rest pull fields out of the feed and out of the raw post source.
patterns = {
    "badgroups": (_alternation(badgroups), re.IGNORECASE),
    "badauthors": (_alternation(badauthors), re.IGNORECASE),
    "badkeywords": (_alternation(badkeywords), re.IGNORECASE),
    "postid": (".*show_docid=([a-f0-9]*$)", 0),
    "group": ("(.*) Google Group", 0),
    "newsgroups": ("Newsgroups: (.*)", 0),
    "author": ("From: (.*)", 0),
    "charset": ('Content-Type: .*charset="?([^ ;"\n]*)"?', 0),
    # Everything after the first blank line; DOTALL lets "." span newlines.
    "body": ("\n\n(.*)", re.DOTALL),
}

# name -> compiled pattern object, built once up front.
compiled = dict(
    (name, re.compile(source, flags))
    for name, (source, flags) in patterns.items()
)
|---|
| 38 | |
|---|
# Subject line -> timestamp of the run in which it was last seen.  Loaded from
# the cache written by a previous run, when there is one.
blacklisted_subjects = {}
if os.path.exists(subject_filename):
    subject_file = open(subject_filename, "r")
    try:
        # NOTE(security): unpickling can execute arbitrary code.  This is only
        # acceptable because the cache is a local file written by this script;
        # never point subject_filename at data from an untrusted source.
        try:
            cached_subjects = pickle.loads(subject_file.read())
        except (pickle.UnpicklingError, EOFError, ValueError, IndexError,
                KeyError, AttributeError, ImportError):
            # A truncated or corrupt cache must not break every future run;
            # the blacklist is disposable and gets rebuilt over time.
            cached_subjects = None
    finally:
        # Close the handle even when reading or unpickling fails.
        subject_file.close()

    if isinstance(cached_subjects, dict):
        blacklisted_subjects = cached_subjects
    else:
        sys.stderr.write(
            "warning: ignoring unreadable subject cache %s\n" % subject_filename)
|---|
| 44 | |
|---|
# Timestamp stored against every subject that gets blacklisted in this run.
now = time.time()

# Parse the RSS document supplied on standard input.
# NOTE: this rebinds ``xml`` from the imported package to the parsed Document.
# The name is kept because the final output step calls ``xml.toxml(...)``.
xml = xml.dom.minidom.parseString(sys.stdin.read())
channel = xml.getElementsByTagName("rss")[0].getElementsByTagName("channel")[0]
channeltitle = channel.getElementsByTagName("title")[0].firstChild.data

# The group name is whatever precedes " Google Group" in the channel title;
# it is needed to build the URL of each post.  Fail with a readable message
# instead of an AttributeError on None when the title has another shape.
group_match = compiled["group"].match(channeltitle)
if group_match is None:
    sys.exit("usenet filter: no group name found in channel title %r"
             % channeltitle)
group = group_match.group(1)
|---|
| 51 | |
|---|
def _first_text(node, tag):
    """Return the text of the first <tag> element under *node*.

    Returns u"" when the element is missing or has no content, so callers
    never dereference a missing child node.
    """
    found = node.getElementsByTagName(tag)
    if not found or found[0].firstChild is None:
        return u""
    return found[0].firstChild.data


def _field(name, text):
    """Return group 1 of compiled[name] searched in *text*, or "" if absent."""
    match = compiled[name].search(text)
    if match is None:
        return ""
    return match.group(1)


def _fetch_source(post_id):
    """Download and return the raw source of post *post_id* of the group."""
    url = "http://groups.google.co.za/group/%s/msg/%s?dmode=source&output=gplain" % (group, post_id)
    response = urllib2.urlopen(
        urllib2.Request(url, None, {"User-Agent": useragent}))
    try:
        return response.read()
    finally:
        response.close()


def _reject(item, title):
    """Blacklist *title* as of this run and remove *item* from the feed."""
    blacklisted_subjects[title] = now
    channel.removeChild(item)
    item.unlink()


def _to_unicode(body, text):
    """Decode the byte string *body* using the charset declared in *text*.

    Falls back to latin-1, which accepts any byte sequence, when no charset
    is declared or the declared one is unknown.
    """
    declared = compiled["charset"].search(text)
    if declared is not None:
        try:
            return body.decode(declared.group(1), "replace")
        except (LookupError, UnicodeError):
            pass  # unknown or unusable charset label: use the fallback below
    return body.decode("latin-1")


def _set_description(item, body):
    """Replace the text of the <description> of *item* with *body*."""
    document = item.ownerDocument
    found = item.getElementsByTagName("description")
    if found:
        description = found[0]
    else:
        description = document.createElement("description")
        item.appendChild(description)
    if description.firstChild is None:
        description.appendChild(document.createTextNode(body))
    else:
        description.firstChild.data = body


for item in channel.getElementsByTagName("item"):
    # Check if subject matches blacklist or bad keyword
    title = _first_text(item, "title")
    if title in blacklisted_subjects or compiled["badkeywords"].search(title):
        _reject(item, title)
        continue

    # Otherwise fetch the whole post
    postid_match = compiled["postid"].match(_first_text(item, "link"))
    if postid_match is None or not postid_match.group(1):
        sys.stderr.write(
            "warning: no post id in link of %r; item left unchanged\n" % title)
        continue
    try:
        text = _fetch_source(postid_match.group(1))
    except IOError:
        # urllib2.URLError (and so HTTPError) derives from IOError.  One
        # unreachable post should not abort the run and lose the whole feed.
        sys.stderr.write(
            "warning: could not fetch %r (%s); item left unchanged\n"
            % (title, sys.exc_info()[1]))
        continue

    # Do more filtering
    newsgroups = _field("newsgroups", text)
    author = _field("author", text)
    body = _field("body", text)
    crosspost_count = len(newsgroups.split(","))

    if (crosspost_count > crosspost_limit
            or compiled["badgroups"].search(newsgroups)
            or compiled["badauthors"].search(author)
            or compiled["badkeywords"].search(body)):
        _reject(item, title)
        continue

    # Put full body in feed.  The DOM must hold unicode text: the final
    # xml.toxml(encoding="UTF-8") call does the encoding itself, and byte
    # strings with non-ASCII content make it fail under Python 2.
    if body:
        _set_description(item, _to_unicode(body, text))
|---|
| 85 | |
|---|
# Remove old subjects from the blacklist: anything not seen for roughly two
# months (2 x 30 days, in seconds).
max_age_seconds = 2 * 30 * 24 * 60 * 60
# Iterate over a copy of the items because entries are deleted on the way.
for subject, lastseen in list(blacklisted_subjects.items()):
    if now - lastseen > max_age_seconds:
        del blacklisted_subjects[subject]

# Save blacklist.  Serialise before opening the file: opening with "w"
# truncates it, so a pickling failure must happen while the old cache is
# still intact.
pickled_subjects = pickle.dumps(blacklisted_subjects)
subject_file = open(subject_filename, "w")
try:
    subject_file.write(pickled_subjects)
finally:
    # Close the handle even when the write fails.
    subject_file.close()
|---|
| 94 | |
|---|
# Emit the filtered feed on standard output as UTF-8 encoded XML, followed by
# a newline (exactly what the print statement produced).
feed_output = xml.toxml(encoding="UTF-8")
sys.stdout.write(feed_output + "\n")
|---|