Initial import scripts

2009-12-28 17:06:06 -05:00 · 2009-12-28 17:06:06 -05:00 · 04d130367c
commit 04d130367c
2 changed files with 52 additions and 0 deletions
--- a/atom.py
+++ b/atom.py
@ -0,0 +1,37 @@
+from lxml import html
+import yaml
+import urllib2
+import urlparse
+
+# Read the blogger roster from disk and parse it; the parsed mapping is
+# updated in place further down and written back to the same file.
+with open('bloggers.yml') as roster_file:
+    raw_roster = roster_file.read()
+users = yaml.safe_load(raw_roster)
+
+def fetch_links(url):
+    """Return the href of the preferred feed <link> on the page at url.
+
+    The page is downloaded and parsed with lxml, and its
+    <link rel="alternate"> elements whose type mentions rss, atom or rdf
+    are collected.  An Atom link that does not look like a comments feed
+    (judging by its href and title) is preferred; failing that, the first
+    feed link found on the page is used.
+
+    The href is returned exactly as written in the page, so it may be
+    relative; resolving it against url is left to the caller.
+
+    Raises IndexError if the page advertises no feed link at all.
+    """
+    response = urllib2.urlopen(url)
+    try:
+        tree = html.fromstring(response.read())
+    finally:
+        # Close the connection explicitly instead of leaving it to garbage
+        # collection; this function is called once per page in a loop.
+        response.close()
+    # Require @href so a malformed <link> cannot trigger a KeyError below.
+    links = tree.xpath(
+        '//link[@rel="alternate"][@href][contains(@type, "rss") or ' +
+        'contains(@type, "atom") or contains(@type, "rdf")]')
+    if not links:
+        # Same exception type the bare links[0] used to raise, but with a
+        # message that says which page was at fault.
+        raise IndexError('no feed <link> found at %s' % (url,))
+    # Both the href and the title are compared in lowercase so that titles
+    # such as "Comments Feed" are recognised as comment feeds too.
+    candidates = [l for l in links if
+                  'atom' in l.attrib['type'] and
+                  'comments' not in l.attrib['href'].lower() and
+                  'comments' not in l.attrib.get('title', '').lower()]
+    if candidates:
+        return candidates[0].attrib['href']
+    return links[0].attrib['href']
+
+# For every blogger, look up the feed of each listed page and store it as
+# the third element of that link entry, printing progress along the way.
+for blogger, info in users.items():
+    print "[%s]" % blogger
+    for entry in info['links']:
+        title, page_url = entry[:2]
+        clean_title = title.strip()
+        print " - %s:" % clean_title
+        entry[0] = clean_title
+        # Entries holding only a title and a page URL get a slot for the feed.
+        if len(entry) < 3:
+            entry.append(None)
+        feed = fetch_links(page_url)
+        if feed.startswith('http:'):
+            resolved = feed
+        else:
+            # Anything not starting with 'http:' is resolved against the page.
+            resolved = urlparse.urljoin(page_url, feed)
+        print "   %s" % (resolved,)
+        entry[2] = resolved
+
+# Write the roster, now carrying the feed URLs, back to the same file.
+with open('bloggers.yml', 'w') as roster_out:
+    yaml.safe_dump(users, roster_out)
--- a/import.py
+++ b/import.py
@ -0,0 +1,15 @@
+#!/usr/bin/python
+"""Scrape /tmp/iron-blogger.html and print its table as YAML on stdout.
+
+Every table row after the first becomes one entry, keyed by the username
+found in the first cell.  Each entry holds the (text, href) pairs of the
+anchors in the second cell under 'links' and the stripped text of the
+third cell under 'start'.
+"""
+from lxml import html
+import yaml
+
+# Use a context manager so the input file is closed as soon as it is read.
+with open('/tmp/iron-blogger.html') as page:
+    tree = html.fromstring(page.read())
+
+who = {}
+# The first <tr> is skipped (presumably the table's header row).
+for tr in tree.xpath('//tr')[1:]:
+    username = str(tr.xpath('td[1]/tt/text()')[0])
+    anchors = tr.xpath('td[2]/a')
+    links = [(a.text, a.attrib['href']) for a in anchors]
+    start = str(tr.xpath('td[3]/text()')[0]).strip()
+    who[username] = dict(links=links, start=start)
+
+print yaml.safe_dump(who)