iron-blogger/import-feeds.py

#!/usr/bin/python
from lxml import html
import yaml
import sys
import urllib2
import urlparse

# Parse the blogger roster from bloggers.yml into `users`; the parser reads
# directly from the open file object.
with open('bloggers.yml') as roster:
    users = yaml.safe_load(roster)

def fetch_links(url):
    """Return the feed URL advertised by the HTML page at ``url``, or None.

    The page is downloaded and parsed, and its ``<link rel="alternate">``
    elements whose ``type`` mentions rss, atom or rdf are considered.  An
    Atom feed that does not look like a comments feed is preferred;
    otherwise the first advertised feed of any kind is returned.

    The result is the raw ``href`` attribute value, so it may be relative to
    ``url``.  If the page advertises no usable feed, a message is printed to
    stderr and None is returned.  Errors raised while downloading or parsing
    the page propagate to the caller.
    """
    # urllib2 responses are not context managers on Python 2, so close the
    # connection explicitly once the body has been read.
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    tree = html.fromstring(page)
    advertised = tree.xpath(
        '//link[@rel="alternate"][contains(@type, "rss") or ' +
        'contains(@type, "atom") or contains(@type, "rdf")]')
    # A <link> without an href cannot point at a feed; dropping it here also
    # keeps the attribute lookups below from raising KeyError.
    links = [l for l in advertised if l.attrib.get('href')]

    for l in links:
        if 'atom' not in l.attrib['type']:
            continue
        # Compare case-insensitively in both places so that titles such as
        # "Comments Feed" are recognised as comment feeds too.
        if 'comments' in l.attrib['href'].lower():
            continue
        if 'comments' in l.attrib.get('title', '').lower():
            continue
        return l.attrib['href']

    if links:
        # No suitable Atom feed: fall back to the first feed of any type.
        return links[0].attrib['href']

    print >>sys.stderr, "No link found for %s" % (url,)
    return None

# Discover a feed URL for every blog entry that does not have one yet.
# Each item of a blogger's 'links' list is a list starting with
# [title, page_url]; a discovered feed URL is appended as the third element,
# modifying `users` in place.
for blogger in users.values():
    for entry in blogger['links']:
        if len(entry) < 2:
            # Without both a title and a page URL there is nothing to fetch.
            print >>sys.stderr, "Skipping malformed entry %r" % (entry,)
            continue
        entry[0] = entry[0].strip()
        if len(entry) >= 3:
            # A feed URL is already recorded for this entry.
            continue
        url = entry[1]
        try:
            link = fetch_links(url)
        except Exception as err:
            # Top-level boundary of a batch job: one unreachable or
            # unparseable page must not abort the run and discard the feeds
            # already discovered for the other entries.
            print >>sys.stderr, "Could not fetch %s: %r" % (url, err)
            continue
        if link:
            # Feed hrefs may be relative to the page they were found on.
            if not link.startswith(('http:', 'https:')):
                link = urlparse.urljoin(url, link)
            entry.append(link)

# Write the updated roster back to bloggers.yml.  Serialize before opening
# the file: mode 'w' truncates it immediately, so a failure while dumping
# would otherwise leave the roster empty or partially written.
serialized = yaml.safe_dump(users)
with open('bloggers.yml', 'w') as f:
    f.write(serialized)
Make all the scripts executable. 2009-12-29 18:37:40 +00:00			`#!/usr/bin/python`
Initial import scripts 2009-12-28 22:06:06 +00:00			`from lxml import html`
			`import yaml`
import-feeds: Handle blogs without <link> tags. 2010-02-09 22:23:57 +00:00			`import sys`
Initial import scripts 2009-12-28 22:06:06 +00:00			`import urllib2`
			`import urlparse`

			`with open('bloggers.yml') as f:`
			`users = yaml.safe_load(f.read())`

			`def fetch_links(url):`
			`tree = html.fromstring(urllib2.urlopen(url).read())`
			`links = tree.xpath(`
			`'//link[@rel="alternate"][contains(@type, "rss") or ' +`
			`'contains(@type, "atom") or contains(@type, "rdf")]')`
			`candidates = [l for l in links if`
			`'atom' in l.attrib['type'] and`
			`'comments' not in l.attrib['href'].lower() and`
			`'comments' not in l.attrib.get('title','')]`
			`if candidates:`
			`return candidates[0].attrib['href']`
import-feeds: Handle blogs without <link> tags. 2010-02-09 22:23:57 +00:00			`elif links:`
			`return links[0].attrib['href']`
			`else:`
			`print >>sys.stderr, "No link found for %s" % (url,)`
			`return None`
Initial import scripts 2009-12-28 22:06:06 +00:00
			`for (name, u) in users.items():`
			`for e in u['links']:`
			`(title, url) = e[0:2]`
			`e[0] = e[0].strip()`
rename 2009-12-28 22:37:17 +00:00			`if len(e) == 3:`
			`continue`
Initial import scripts 2009-12-28 22:06:06 +00:00			`link = fetch_links(url)`
import-feeds: Handle blogs without <link> tags. 2010-02-09 22:23:57 +00:00			`if link:`
			`if not link.startswith('http:'):`
			`link = urlparse.urljoin(url, link)`
			`e.append(link)`
Initial import scripts 2009-12-28 22:06:06 +00:00
			`with open('bloggers.yml', 'w') as f:`
			`yaml.safe_dump(users, f)`