iron-blogger/import-feeds.py

#!/usr/bin/python3
from lxml import html
import yaml
import sys
import urllib.request
import urllib.parse

# Read the whole blogger roster into memory; the parsed structure is kept in
# `users` so it can be modified below and written back at the end.
with open('bloggers.yml') as roster_file:
    roster_text = roster_file.read()
    users = yaml.safe_load(roster_text)

def fetch_links(url):
    """Return the href of the best feed ``<link>`` advertised by *url*.

    The page at *url* is downloaded and parsed as HTML.  Every
    ``<link rel="alternate">`` whose ``type`` mentions rss, atom or rdf and
    which carries a non-empty ``href`` is considered.  Atom links that do
    not look like comment feeds ("comments" absent from both href and
    title, compared case-insensitively) are preferred; if there are none,
    the first feed link of any kind is used.

    Progress and failures are reported on stderr.

    Returns:
        The ``href`` value exactly as written in the page (it may be
        relative to *url*), or None when the page cannot be fetched or
        parsed or advertises no feed.
    """
    print("Looking for feeds in %s" % (url,), file=sys.stderr)
    try:
        # Close the response deterministically instead of leaking it.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        tree = html.document_fromstring(page)
        found = tree.xpath(
            '//link[@rel="alternate"][contains(@type, "rss") or ' +
            'contains(@type, "atom") or contains(@type, "rdf")]')
    except Exception as exc:
        # Discovery is best effort: a blog that is down or serves broken
        # markup must not abort the whole import.  Unlike a bare `except:`,
        # this still lets KeyboardInterrupt/SystemExit through.
        print("Could not read %s: %s" % (url, exc), file=sys.stderr)
        found = []

    # A <link> without an href is useless; dropping it here keeps one
    # malformed tag from discarding every other feed on the page.
    links = [l for l in found if l.attrib.get('href')]
    candidates = [l for l in links if
                  'atom' in l.attrib.get('type', '') and
                  'comments' not in l.attrib['href'].lower() and
                  'comments' not in l.attrib.get('title', '').lower()]

    if candidates:
        return candidates[0].attrib['href']
    if links:
        return links[0].attrib['href']
    print("No link found for %s" % (url,), file=sys.stderr)
    return None

# For every blogger, tidy each link entry and append a discovered feed URL to
# entries that do not have one yet.  Within an entry, index 1 is the title and
# index 2 the blog URL; the feed URL is appended after them.
for name, u in users.items():
    print("Processing user %s" % (name,), file=sys.stderr)
    for e in u['links']:
        if len(e) < 3:
            # Without a title and URL there is nothing to look up; leave the
            # entry untouched rather than crashing the whole import.
            print("Skipping malformed link entry for %s: %r" % (name, e),
                  file=sys.stderr)
            continue
        try:
            e[1] = e[1].strip()
        except AttributeError:
            # Title is not a string (e.g. None); keep it as it is.
            pass
        if len(e) >= 4:
            # A feed was already recorded for this entry.
            continue
        url = e[2]
        link = fetch_links(url)
        if link:
            # Feed hrefs may be relative or protocol-relative; resolve them
            # against the page they came from.
            if not link.startswith(('http:', 'https:')):
                link = urllib.parse.urljoin(url, link)
            e.append(link)

# Write the (possibly updated) roster back over the file it was read from.
with open('bloggers.yml', 'w') as roster_out:
    yaml.safe_dump(users, stream=roster_out)