#!/usr/bin/env python
"""Filter the items of an RSS feed by sub-element whitelists and blacklists.

The feed is read from standard input and the filtered feed is written to
standard output as UTF-8 XML.  An item is kept when

* no whitelist is configured, or at least one whitelisted sub-element of the
  item matches its whitelist pattern, and
* none of its blacklisted sub-elements matches its blacklist pattern.

The code runs unchanged on Python 2.7 and Python 3.
"""
# This is a generic filter for selecting items in an rss feed on the basis of
# subelement blacklists and whitelists
#
# Written by Adrianna Pinska
# Licence: GPLv3

import sys
import re
import xml.dom.minidom

# Sub-element name -> list of alternatives.  The alternatives are joined with
# "|" without escaping, so each entry is a regular-expression fragment.
include = {
    "title": [
        "Ressentiment",
        "Pluto",
        "Vinland Saga",
        "Twin Spica",
        "Historie",
        "Full Metal Alchemist",
        "Billy Bat",
        "Cherry",
        "Cesare",
    ],
}

exclude = {
}


def compile_filters(patterns):
    """Compile a ``{element: [alternative, ...]}`` mapping into regexes.

    Returns a ``{element: compiled_regex}`` dict.  Elements whose list is
    empty are skipped: an empty alternation would compile to ``()``, which
    matches every string and would make a blacklist entry drop every item.
    """
    return dict(
        (element, re.compile("(%s)" % "|".join(alternatives)))
        for element, alternatives in patterns.items()
        if alternatives
    )


whitelist = compile_filters(include)
blacklist = compile_filters(exclude)


def _element_text(item, tag):
    """Return the text of the first ``tag`` descendant of ``item``.

    Text and CDATA children are concatenated.  A missing or empty element
    yields ``""`` instead of raising, so such items simply never match.
    """
    nodes = item.getElementsByTagName(tag)
    if not nodes:
        return ""
    return "".join(
        child.data
        for child in nodes[0].childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )


def _is_wanted(item, allow, deny):
    """Tell whether ``item`` passes the ``allow`` and ``deny`` regex maps."""
    if allow and not any(
        regex.search(_element_text(item, element))
        for element, regex in allow.items()
    ):
        return False
    return not any(
        regex.search(_element_text(item, element))
        for element, regex in deny.items()
    )


def filter_feed(source, allow, deny):
    """Remove unwanted items from an RSS document.

    ``source`` is the feed as text or bytes; ``allow`` and ``deny`` are
    ``{element: compiled_regex}`` maps as built by ``compile_filters``.

    Returns the filtered document serialised as UTF-8 bytes, with trailing
    spaces stripped from lines and runs of newlines collapsed.

    Raises ``IndexError`` when the document has no ``rss``/``channel``
    element, and whatever ``xml.dom.minidom.parseString`` raises on
    malformed XML.
    """
    doc = xml.dom.minidom.parseString(source)
    rss = doc.getElementsByTagName("rss")[0]
    channel = rss.getElementsByTagName("channel")[0]

    for item in channel.getElementsByTagName("item"):
        if _is_wanted(item, allow, deny):
            continue
        # Each item is removed exactly once, whichever filter rejected it.
        item.parentNode.removeChild(item)
        item.unlink()

    # toxml() with an encoding returns bytes, so the patterns are bytes too.
    text = doc.toxml(encoding="UTF-8")
    text = re.sub(b" *\n", b"\n", text)
    text = re.sub(b"\n+", b"\n", text)
    return text


def main():
    """Filter the feed on stdin with the configured lists; write to stdout."""
    # Use the binary streams where they exist (Python 3) so the encoding
    # declared in the XML is honoured and the UTF-8 output is written as-is.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)
    stdout.write(filter_feed(stdin.read(), whitelist, blacklist) + b"\n")


if __name__ == "__main__":
    main()