Initial import scripts
This commit is contained in:
commit
04d130367c
|
@ -0,0 +1,37 @@
|
|||
from lxml import html
|
||||
import yaml
|
||||
import urllib2
|
||||
import urlparse
|
||||
|
||||
# Read the roster file in full, then parse it once the handle is closed.
# The result is iterated below with .items(), and each value is indexed
# with ['links'].
with open('bloggers.yml') as roster_file:
    roster_text = roster_file.read()
users = yaml.safe_load(roster_text)
|
||||
|
||||
def fetch_links(url):
    """Return the href of the preferred feed advertised by the page at *url*.

    The page is downloaded and parsed, and its ``<link rel="alternate">``
    elements whose ``type`` mentions rss, atom or rdf are collected.  The
    first Atom link that does not look like a comments feed (judged by its
    href and title, case-insensitively) wins; if there is none, the first
    advertised feed of any kind is returned.

    The href is returned exactly as the page declares it, so it may be
    relative to *url*; resolving it is left to the caller.

    Raises:
        IndexError: if the page advertises no feed link with an href.
    """
    response = urllib2.urlopen(url)
    try:
        tree = html.fromstring(response.read())
    finally:
        # Release the connection even when reading or parsing fails.
        response.close()

    advertised = tree.xpath(
        '//link[@rel="alternate"][contains(@type, "rss") or ' +
        'contains(@type, "atom") or contains(@type, "rdf")]')
    # A <link> without an href cannot be followed, so it is never a candidate.
    links = [link for link in advertised if link.attrib.get('href')]
    if not links:
        raise IndexError('no feed <link> found at %s' % (url,))

    for link in links:
        # The XPath predicate above guarantees a type attribute is present.
        is_atom = 'atom' in link.attrib['type']
        # Compare both href and title in lower case so that titles such as
        # "Comments Feed" are recognised as comment feeds too.
        about_comments = (
            'comments' in link.attrib['href'].lower() or
            'comments' in link.attrib.get('title', '').lower())
        if is_atom and not about_comments:
            return link.attrib['href']

    # No suitable Atom feed: fall back to whatever the page lists first.
    return links[0].attrib['href']
|
||||
|
||||
# Walk every blogger's link entries, look up the feed for each blog URL and
# store it in the entry's third slot, then write the roster back to disk.
for blogger, info in users.items():
    print("[%s]" % blogger)
    for entry in info['links']:
        # An entry starts with (title, url); a third slot for the feed URL
        # is added when it is not there yet.
        title, url = entry[:2]
        clean_title = title.strip()
        print(" - %s:" % clean_title)
        entry[0] = clean_title
        if len(entry) < 3:
            entry.append(None)

        feed = fetch_links(url)
        # Anything not starting with 'http:' is resolved against the blog URL.
        if feed.startswith('http:'):
            resolved = feed
        else:
            resolved = urlparse.urljoin(url, feed)
        print(" %s" % (resolved,))
        entry[2] = resolved

# Persist the updated roster over the original file.
with open('bloggers.yml', 'w') as roster_out:
    yaml.safe_dump(users, roster_out)
|
|
@ -0,0 +1,15 @@
|
|||
#!/usr/bin/python
|
||||
from lxml import html
|
||||
import yaml
|
||||
|
||||
# Parse the saved HTML page and print a YAML mapping of
# username -> {links: [(title, href), ...], start: <text of third cell>}.
# Use a context manager so the input file is closed as soon as it is read.
with open('/tmp/iron-blogger.html') as page:
    tree = html.fromstring(page.read())

who = {}
# The first <tr> is skipped (presumably the table's header row -- confirm
# against the saved page).
for tr in tree.xpath('//tr')[1:]:
    # str() turns the XPath text results into plain strings before dumping.
    # NOTE(review): likely needed because safe_dump rejects lxml's string
    # subclass -- confirm.
    username = str(tr.xpath('td[1]/tt/text()')[0])
    links = [(a.text, a.attrib['href']) for a in tr.xpath('td[2]/a')]
    start = str(tr.xpath('td[3]/text()')[0]).strip()
    who[username] = dict(links=links, start=start)

print(yaml.safe_dump(who))
|
Loading…
Reference in New Issue