root/confluence/feedfilters/twitterfilter.py @ 587

Revision 587, 1.8 kB (checked in by confluence, 15 months ago)

noise

  • Property svn:executable set to *
Line 
#!/usr/bin/env python

# This filter removes posts which contain any of the specified keywords
# from a Twitter feed.  It also makes urls and @usernames into clickable
# links, and shortens item titles to poster usernames.
#
# The RSS document is read from standard input and the filtered document
# is written to standard output, encoded as UTF-8.
#
# Written by Adrianna Pinska
# Licence: GPLv3

import sys
import re
import xml.dom.minidom

# Add crap you don't want to see to this list (each entry is a regular
# expression fragment; an item whose description matches any is dropped):
noise = ["#postcrossing", "[Nn]et[Pp]rophet", "#[Aa]frica09"]

noisepattern = re.compile("(%s)" % "|".join(noise))
titlepattern = re.compile(r"^([^:]*):.*$", re.DOTALL)
linkurls = re.compile(r"(https?://[^ ]*)")
linknames = re.compile(r"@([a-zA-Z0-9_]+)")
hashtags = re.compile(r"#([a-zA-Z0-9_]+)")
statusidpattern = re.compile(r"http://twitter.com/.*/statuses/([0-9]*)")

# One alternation of the three patterns above (groups 1, 2, 3 = url, name,
# tag).  Substituting in a single pass means an '@' or '#' inside a URL is
# consumed as part of the URL and is never re-linked inside the generated
# anchor markup.
linkables = re.compile(
    "|".join(p.pattern for p in (linkurls, linknames, hashtags)))

REPLY_LINK = (" <a href='http://twitter.com/home?status=@%s%%20"
              "&in_reply_to_status_id=%s&in_reply_to=%s'>Reply</a>")


def _linkify_match(match):
    """Return the HTML anchor for one URL, @username or #hashtag match."""
    url, name, tag = match.groups()
    if url is not None:
        return "<a href='%s'>%s</a>" % (url, url)
    if name is not None:
        return "<a href='http://twitter.com/%s'>@%s</a>" % (name, name)
    return ("<a href='http://search.twitter.com/search?q=%%23%s'>#%s</a>"
            % (tag, tag))


def linkify(text):
    """Return text with URLs, @usernames and #hashtags wrapped in anchors."""
    return linkables.sub(_linkify_match, text)


def _text_node(item, tagname):
    """Return the character-data node of item's first <tagname> element.

    Returns None when the element is absent, empty, or its first child is
    not a text/CDATA node, so callers can skip it instead of crashing.
    """
    elements = item.getElementsByTagName(tagname)
    if not elements:
        return None
    node = elements[0].firstChild
    if node is None:
        return None
    if node.nodeType not in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
        return None
    return node


def filter_feed(source):
    """Filter and decorate an RSS document.

    source is the RSS document as a string (or bytes).  Items whose
    description matches noisepattern are removed.  For the remaining items
    the title is cut down to the part before the first colon, the
    description is linkified, and a Reply link is appended when both the
    username and the status id (taken from the guid) are available.

    Returns the serialised document as produced by toxml(encoding="UTF-8").
    Raises IndexError if the document has no rss/channel element.
    """
    doc = xml.dom.minidom.parseString(source)
    rss = doc.getElementsByTagName("rss")[0]
    channel = rss.getElementsByTagName("channel")[0]

    for item in channel.getElementsByTagName("item"):
        description = _text_node(item, "description")
        if description is not None:
            text = description.data
        else:
            text = ""

        if noise and noisepattern.search(text):
            # getElementsByTagName searches all descendants, so the item is
            # not guaranteed to be a direct child of the channel.
            item.parentNode.removeChild(item)
            item.unlink()
            continue

        username = None
        title = _text_node(item, "title")
        if title is not None:
            title.data = titlepattern.sub(r"\1", title.data)
            username = title.data.strip(":")

        if description is None:
            # Nothing to linkify and nowhere to put a Reply link.
            continue

        text = linkify(text)

        guid = _text_node(item, "guid")
        status = None
        if guid is not None:
            status = statusidpattern.match(guid.data)
        if username is not None and status is not None:
            text += REPLY_LINK % (username, status.group(1), username)

        description.data = text

    return doc.toxml(encoding="UTF-8")


def main():
    """Read a feed from stdin and write the filtered feed to stdout."""
    # Python 3 exposes the byte streams as .buffer; Python 2 streams are
    # already byte-oriented.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)
    stdout.write(filter_feed(stdin.read()))
    stdout.write(b"\n")


if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the browser.