iron-blogger/atom.py

from lxml import html
import yaml
import urllib2
import urlparse

with open('bloggers.yml') as f:
    users = yaml.safe_load(f.read())

def fetch_links(url):
    tree = html.fromstring(urllib2.urlopen(url).read())
    links = tree.xpath(
        '//link[@rel="alternate"][contains(@type, "rss") or ' +
        'contains(@type, "atom") or contains(@type, "rdf")]')
    candidates = [l for l in links if
                  'atom' in l.attrib['type'] and
                  'comments' not in l.attrib['href'].lower() and
                  'comments' not in l.attrib.get('title','')]
    if candidates:
        return candidates[0].attrib['href']
    return links[0].attrib['href']

for (name, u) in users.items():
    print "[%s]" % name
    for e in u['links']:
        (title, url) = e[0:2]
        print " - %s:" % title.strip()
        e[0] = e[0].strip()
        if len(e) < 3:
            e.append(None)
        link = fetch_links(url)
        if not link.startswith('http:'):
            link = urlparse.urljoin(url, link)
        print "   %s" % (link,)
        e[2] = link

with open('bloggers.yml', 'w') as f:
    yaml.safe_dump(users, f)
Initial import scripts 2009-12-28 22:06:06 +00:00			`from lxml import html`
			`import yaml`
			`import urllib2`
			`import urlparse`

			`with open('bloggers.yml') as f:`
			`users = yaml.safe_load(f.read())`

			`def fetch_links(url):`
			`tree = html.fromstring(urllib2.urlopen(url).read())`
			`links = tree.xpath(`
			`'//link[@rel="alternate"][contains(@type, "rss") or ' +`
			`'contains(@type, "atom") or contains(@type, "rdf")]')`
			`candidates = [l for l in links if`
			`'atom' in l.attrib['type'] and`
			`'comments' not in l.attrib['href'].lower() and`
			`'comments' not in l.attrib.get('title','')]`
			`if candidates:`
			`return candidates[0].attrib['href']`
			`return links[0].attrib['href']`

			`for (name, u) in users.items():`
			`print "[%s]" % name`
			`for e in u['links']:`
			`(title, url) = e[0:2]`
			`print " - %s:" % title.strip()`
			`e[0] = e[0].strip()`
			`if len(e) < 3:`
			`e.append(None)`
			`link = fetch_links(url)`
			`if not link.startswith('http:'):`
			`link = urlparse.urljoin(url, link)`
			`print " %s" % (link,)`
			`e[2] = link`

			`with open('bloggers.yml', 'w') as f:`
			`yaml.safe_dump(users, f)`