root/confluence/feedfilters/usenetfilter.py

Revision 582, 3.6 kB (checked in by confluence, 14 months ago)

work in progress

  • Property svn:executable set to *
Line 
#!/usr/bin/env python

# Written by Adrianna Pinska
# Licence: GPLv3
5
import contextlib
import os.path
import pickle
import re
import sys
import time
import urllib2
import xml.dom.minidom
13
# Posts sent to more than this many newsgroups at once are dropped.
crosspost_limit = 3
# File holding the pickled subject blacklist between runs.  The path is
# relative, so it is resolved against the current working directory.
subject_filename = ".usenet_filter_subject_cache"
# User-Agent header sent with every request for a post's source.
useragent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.16) Gecko/20080702 (Debian-1.8.1.16+nobinonly-0ubuntu1) Galeon/2.0.4 (Ubuntu 2.0.4-1ubuntu1)'

# Add crap you don't want to see to these lists.
# Every entry is a regular-expression fragment (hence the escaped dots and
# the raw strings) and is searched for case-insensitively.
badgroups = [
    "celebrities",
    "gossip",
    "acting",
    r"rec\.arts\.tv",
    "religion",
    r"talk\.origins",
    r"sci\.meow",
    "politics",
    "autism",
    "depression",
    "economics",
    r"alt\.community",
    "kooks",
    "psychology",
    "revisionism",
    "comedy",
    "jewish",
]
badauthors = ["Day Brown", "Colonel Jake"]
badkeywords = ["libertarian", "WTC", "truther", "9/11"]


def alternation_pattern(fragments):
    """Return regex source that matches any one of *fragments*.

    The fragments are joined with ``|`` inside a single group.  An empty
    list yields ``(?!)``, a pattern that can never match; joining nothing
    would otherwise give ``()``, which matches every string and would
    make an emptied filter list block everything.
    """
    if not fragments:
        return "(?!)"
    return "(%s)" % "|".join(fragments)


# name -> (regex source, re flags)
patterns = {
    # Filters built from the lists above.
    "badgroups": (alternation_pattern(badgroups), re.IGNORECASE),
    "badauthors": (alternation_pattern(badauthors), re.IGNORECASE),
    "badkeywords": (alternation_pattern(badkeywords), re.IGNORECASE),
    # Hex post id at the very end of an item's link.
    "postid": (".*show_docid=([a-f0-9]*$)", 0),
    # Group name at the start of the channel title.
    "group": ("(.*) Google Group", 0),
    # Fields pulled out of a post's source text.
    # NOTE(review): these two are not anchored to the start of a line, so
    # the first occurrence anywhere in the text wins -- confirm that is
    # acceptable for the source format being fetched.
    "newsgroups": ("Newsgroups: (.*)", 0),
    "author": ("From: (.*)", 0),
    "charset": ('Content-Type: .*charset="?([^ ;"\n]*)"?', 0),
    # Everything after the first blank line.
    "body": ("\n\n(.*)", re.DOTALL),
}

# name -> compiled pattern object
compiled = {}
for name, (pattern, flags) in patterns.items():
    compiled[name] = re.compile(pattern, flags)
38
# Load the subject blacklist saved by a previous run, if there is one.
# It maps a subject line to the time.time() value of the run that last
# saw or blocked it.
# NOTE(security): unpickling can execute arbitrary code, so the cache file
# must never come from anywhere but this script.
blacklisted_subjects = {}
if os.path.exists(subject_filename):
    # The with-block closes the file even if reading fails.
    with open(subject_filename, "r") as subject_file:
        cached = subject_file.read()
    # A zero-length file (for instance a save that was interrupted right
    # after the file was truncated) would make pickle.loads raise EOFError
    # on every run; treat it as "no blacklist yet" instead.
    if cached:
        blacklisted_subjects = pickle.loads(cached)
44
# One timestamp for the whole run; it is stored against blacklisted subjects.
now = time.time()

# Read the RSS document from standard input and locate its channel.
# NOTE: rebinding `xml` hides the `xml` package from here on.  The name is
# kept because the rest of the script refers to the document by it.
feed_source = sys.stdin.read()
xml = xml.dom.minidom.parseString(feed_source)
rss_element = xml.getElementsByTagName("rss")[0]
channel = rss_element.getElementsByTagName("channel")[0]

# The group name is whatever precedes " Google Group" in the channel title.
channel_title_element = channel.getElementsByTagName("title")[0]
channeltitle = channel_title_element.firstChild.data
group_match = compiled["group"].match(channeltitle)
group = group_match.group(1)
51
def _discard(item, title):
    """Blacklist *title* as of this run and remove *item* from the feed."""
    blacklisted_subjects[title] = now
    channel.removeChild(item)
    item.unlink()


def _fetch_source(group, postid):
    """Download and return the source text of post *postid* in *group*.

    Raises whatever urllib2 raises on a network or HTTP failure.
    """
    request = urllib2.Request(
        "http://groups.google.co.za/group/%s/msg/%s?dmode=source&output=gplain" % (group, postid),
        None,
        {"User-Agent": useragent},
    )
    # closing() releases the connection even if read() fails.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return response.read()


def _decode_body(body, text):
    """Return *body* as unicode, using the charset declared in *text*.

    Bytes that are invalid in the chosen charset are replaced rather than
    raising.  A missing or unrecognised charset falls back to UTF-8.
    """
    declared = compiled["charset"].search(text)
    if declared:
        try:
            return body.decode(declared.group(1), "replace")
        except LookupError:
            # Unknown (or empty) charset label: use the fallback below.
            pass
    return body.decode("utf-8", "replace")


for item in channel.getElementsByTagName("item"):
    # Check if subject matches blacklist or bad keyword
    title = item.getElementsByTagName("title")[0].firstChild.data
    if title in blacklisted_subjects or compiled["badkeywords"].search(title):
        _discard(item, title)
        continue

    # Otherwise fetch the whole post.  If that is not possible the item is
    # left in the feed exactly as it arrived, so one bad post cannot abort
    # the run (which would also lose this run's blacklist updates).
    link = item.getElementsByTagName("link")[0].firstChild.data
    postid_match = compiled["postid"].match(link)
    if not postid_match:
        sys.stderr.write("usenetfilter: no post id in link %r; item left unfiltered\n" % (link,))
        continue
    try:
        text = _fetch_source(group, postid_match.group(1))
    except IOError as err:
        # urllib2.URLError (and so HTTPError) derives from IOError.
        sys.stderr.write("usenetfilter: could not fetch %r (%s); item left unfiltered\n" % (link, err))
        continue

    # Do more filtering
    newsgroups_match = compiled["newsgroups"].search(text)
    author_match = compiled["author"].search(text)
    body_match = compiled["body"].search(text)
    if not (newsgroups_match and author_match and body_match):
        sys.stderr.write("usenetfilter: could not parse the source of %r; item left unfiltered\n" % (link,))
        continue

    newsgroups = newsgroups_match.group(1)
    crosspost_count = len(newsgroups.split(","))
    author = author_match.group(1)
    body = body_match.group(1)

    if crosspost_count > crosspost_limit or compiled["badgroups"].search(newsgroups) or compiled["badauthors"].search(author) or compiled["badkeywords"].search(body):
        _discard(item, title)
        continue

    # Put full body in feed.  The DOM is given unicode text: a byte string
    # with non-ASCII bytes is expected to raise UnicodeDecodeError when the
    # document is serialised with toxml(encoding=...) on Python 2.
    item.getElementsByTagName("description")[0].firstChild.data = _decode_body(body, text)
85
# remove old subjects from blacklist
# Entries last seen more than 2 x 30 days ago (expressed in seconds) expire.
max_age = 2*30*24*60*60
# list() takes a snapshot so entries can be deleted while looping.
for subject, lastseen in list(blacklisted_subjects.items()):
    if now - lastseen > max_age:
        del blacklisted_subjects[subject]

# save blacklist
# The with-block closes the file even if pickling or writing fails.
with open(subject_filename, "w") as subject_file:
    subject_file.write(pickle.dumps(blacklisted_subjects))

# Write the filtered feed to standard output.  With a single parenthesised
# argument this is still the Python 2 print statement.
print(xml.toxml(encoding="UTF-8"))
Note: See TracBrowser for help on using the browser.