#!/usr/bin/env python
# Written by Adrianna Pinska
# Licence: GPLv3
"""Filter a Google Groups RSS feed and inline the full text of each post.

The feed is read from stdin and the filtered feed is written to stdout as
UTF-8 XML.  Items are dropped when their subject is already blacklisted or
contains a bad keyword, or when the fetched post is crossposted too widely,
is posted to a bad group, comes from a bad author or contains a bad keyword.
Subjects of dropped items are remembered in a pickle cache in the current
working directory so that follow-ups can be dropped without fetching them.
"""

import sys
import re
try:
    import urllib2
except ImportError:
    # Python 3: urllib.request provides the same Request/urlopen API.
    import urllib.request as urllib2
import xml.dom.minidom
import os.path
import pickle
import time

# A post sent to more than this many newsgroups is dropped.
crosspost_limit = 3
subject_filename = ".usenet_filter_subject_cache"
useragent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.16) Gecko/20080702 (Debian-1.8.1.16+nobinonly-0ubuntu1) Galeon/2.0.4 (Ubuntu 2.0.4-1ubuntu1)'
# A blacklisted subject is forgotten once it has not been seen for this many
# seconds (two 30-day months).
blacklist_max_age = 2 * 30 * 24 * 60 * 60

# Add crap you don't want to see to these lists.  Every entry is a regular
# expression fragment (entries are joined with "|"), so escape regex
# metacharacters such as "." when they must match literally.
badgroups = ["celebrities", "gossip", "acting", r"rec\.arts\.tv", "religion",
             r"talk\.origins", r"sci\.meow", "politics", "autism",
             "depression", "economics", r"alt\.community", "kooks",
             "psychology", "revisionism", "comedy", "jewish"]
badauthors = ["Day Brown", "Colonel Jake"]
badkeywords = ["libertarian", "WTC", "truther", "9/11"]

# name -> (pattern, flags)
patterns = {
    "badgroups": ("(%s)" % "|".join(badgroups), re.IGNORECASE),
    "badauthors": ("(%s)" % "|".join(badauthors), re.IGNORECASE),
    "badkeywords": ("(%s)" % "|".join(badkeywords), re.IGNORECASE),
    "postid": (".*show_docid=([a-f0-9]*$)", 0),
    "group": ("(.*) Google Group", 0),
    "newsgroups": ("Newsgroups: (.*)", 0),
    "author": ("From: (.*)", 0),
    "charset": ('Content-Type: .*charset="?([^ ;"\n]*)"?', 0),
    "body": ("\n\n(.*)", re.DOTALL),
}

compiled = {}
for name, (pattern, flags) in patterns.items():
    compiled[name] = re.compile(pattern, flags)


def _warn(message):
    """Write a one-line diagnostic to stderr (stdout carries the feed)."""
    sys.stderr.write("usenet filter: %s\n" % message)


def load_blacklist(filename):
    """Return the cached {subject: last-seen timestamp} dict.

    Returns an empty dict when the cache file does not exist or cannot be
    unpickled (for example after an interrupted write), so one damaged cache
    does not break every later run.
    """
    if not os.path.exists(filename):
        return {}
    # NOTE(security): pickle.loads can execute arbitrary code.  This is only
    # acceptable because the cache is a file this script wrote itself; never
    # point subject_filename at data from an untrusted source.
    try:
        with open(filename, "rb") as cache:
            return pickle.loads(cache.read())
    except (pickle.UnpicklingError, EOFError, ValueError, IndexError,
            KeyError):
        _warn("ignoring unreadable subject cache %r" % (filename,))
        return {}


def save_blacklist(filename, blacklist):
    """Pickle the blacklist dict to filename."""
    with open(filename, "wb") as cache:
        # Protocol 0 is what Python 2's pickle.dumps() uses by default, so
        # the cache format does not depend on the interpreter version.
        cache.write(pickle.dumps(blacklist, 0))


def prune_blacklist(blacklist, now, max_age=blacklist_max_age):
    """Delete, in place, subjects last seen more than max_age seconds ago."""
    # Iterate over a snapshot because entries are deleted during the loop.
    for subject, lastseen in list(blacklist.items()):
        if now - lastseen > max_age:
            del blacklist[subject]


def fetch_post(group, postid):
    """Download the raw source of one post and return it as bytes.

    Raises whatever urlopen raises on failure (URLError is an IOError).
    """
    url = ("http://groups.google.co.za/group/%s/msg/%s?dmode=source&output=gplain"
           % (group, postid))
    request = urllib2.Request(url, None, {"User-Agent": useragent})
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


def _child_text(node, tag):
    """Return the data of the first child of the first <tag> under node.

    Returns None when there is no such element or it has no text.
    """
    elements = node.getElementsByTagName(tag)
    if not elements or elements[0].firstChild is None:
        return None
    return getattr(elements[0].firstChild, "data", None)


def _parse_post(raw):
    """Split a raw post into (newsgroups, author, body, charset).

    charset is None when no Content-Type charset is declared.  Returns None
    when the Newsgroups header, From header or body cannot be found.  All
    returned strings are the raw bytes decoded as latin-1, which maps every
    byte to one character, so the body can be re-encoded losslessly.
    """
    text = raw.decode("latin-1")
    newsgroups = compiled["newsgroups"].search(text)
    author = compiled["author"].search(text)
    body = compiled["body"].search(text)
    if not (newsgroups and author and body):
        return None
    charset = compiled["charset"].search(text)
    return (newsgroups.group(1), author.group(1), body.group(1),
            charset.group(1) if charset else None)


def _decode_body(body, charset):
    """Return body (latin-1 decoded bytes) re-decoded with charset.

    The charset name comes from the post itself, so an unknown, empty or
    unusable name falls back to the latin-1 text, and undecodable bytes are
    replaced rather than aborting the whole feed.
    """
    if not charset:
        return body
    try:
        return body.encode("latin-1").decode(charset, "replace")
    except (LookupError, ValueError, TypeError):
        return body


def _drop_item(item):
    """Detach item from the feed and release its DOM nodes."""
    item.parentNode.removeChild(item)
    item.unlink()


def _set_description(feed, item, body):
    """Store body as the text of item's <description>, creating it if needed."""
    descriptions = item.getElementsByTagName("description")
    if descriptions:
        description = descriptions[0]
    else:
        description = feed.createElement("description")
        item.appendChild(description)
    if description.firstChild is None:
        description.appendChild(feed.createTextNode(body))
    else:
        description.firstChild.data = body


def filter_feed(feed_source, blacklist, now, fetch=fetch_post):
    """Filter an RSS feed and return it serialised as UTF-8 bytes.

    feed_source -- the RSS document (bytes).
    blacklist   -- {subject: timestamp} dict; updated in place with the
                   subject of every dropped item.
    now         -- timestamp recorded for subjects seen in this run.
    fetch       -- callable(group, postid) returning the raw post as bytes.

    Items whose post cannot be identified, fetched or parsed are left in the
    feed unmodified, with a warning on stderr.

    Raises ValueError when the channel title does not name a Google Group,
    and IndexError when the document has no <rss>/<channel> element.
    """
    feed = xml.dom.minidom.parseString(feed_source)
    channel = feed.getElementsByTagName("rss")[0].getElementsByTagName("channel")[0]

    group_match = compiled["group"].match(_child_text(channel, "title") or "")
    if group_match is None:
        raise ValueError("channel title does not look like '<group> Google Group'")
    group = group_match.group(1)

    for item in channel.getElementsByTagName("item"):
        # Check if subject matches blacklist or bad keyword
        title = _child_text(item, "title") or ""
        if title in blacklist or compiled["badkeywords"].search(title):
            blacklist[title] = now
            _drop_item(item)
            continue

        # Otherwise fetch the whole post
        link = _child_text(item, "link") or ""
        postid = compiled["postid"].match(link)
        if postid is None or not postid.group(1):
            _warn("no post id in link %r; item left unchanged" % (link,))
            continue
        try:
            raw = fetch(group, postid.group(1))
        except IOError:
            _warn("could not fetch %r; item left unchanged" % (link,))
            continue
        post = _parse_post(raw)
        if post is None:
            _warn("could not parse post at %r; item left unchanged" % (link,))
            continue
        newsgroups, author, body, charset = post
        # Decode before storing: the DOM holds text, and (in Python 2) a
        # non-ASCII byte string in the tree is not safely serialisable by
        # toxml(encoding=...).
        body = _decode_body(body, charset)

        # Do more filtering
        crosspost_count = len(newsgroups.split(","))
        if (crosspost_count > crosspost_limit
                or compiled["badgroups"].search(newsgroups)
                or compiled["badauthors"].search(author)
                or compiled["badkeywords"].search(body)):
            blacklist[title] = now
            _drop_item(item)
            continue

        # Put full body in feed
        _set_description(feed, item, body)

    return feed.toxml(encoding="UTF-8")


def main():
    """Filter the feed on stdin to stdout and update the subject cache."""
    blacklisted_subjects = load_blacklist(subject_filename)
    now = time.time()

    # Use the underlying byte streams where they exist (Python 3) so that the
    # XML parser, not the locale, decides how the feed is decoded.
    feed_source = getattr(sys.stdin, "buffer", sys.stdin).read()
    output = filter_feed(feed_source, blacklisted_subjects, now)

    # remove old subjects from blacklist
    prune_blacklist(blacklisted_subjects, now)

    # save blacklist
    save_blacklist(subject_filename, blacklisted_subjects)

    getattr(sys.stdout, "buffer", sys.stdout).write(output + b"\n")


if __name__ == "__main__":
    main()