iron-blogger/blogbot.py

140 lines
3.7 KiB
Python
Executable file

#!/usr/bin/python3
import yaml
import feedparser
import datetime
import sys
import os
import shutil
import re
from dateutil.parser import parse
import dateutil.tz as tz
import settings
from git import Repo
def parse_published(pub):
    """Parse a feed timestamp string into a naive datetime.

    The string is parsed with ``dateutil.parser.parse``. The result is
    converted to the local timezone and its tzinfo is then dropped. If
    that conversion fails, the parsed value is returned with its tzinfo
    dropped but without any conversion.

    Errors raised while parsing ``pub`` itself propagate to the caller.
    """
    # Parse exactly once; a parse failure is not something the fallback
    # below could repair, so it is allowed to propagate.
    stamp = parse(pub)
    try:
        return stamp.astimezone(tz.tzlocal()).replace(tzinfo=None)
    except (ValueError, OverflowError, OSError):
        # Local-time conversion can fail (e.g. for values outside the
        # range the platform supports); keep the wall-clock value as parsed.
        return stamp.replace(tzinfo=None)
def get_date(post):
    """Return the first date field present on a feed entry.

    The keys 'published', 'created' and 'updated' are checked in that
    order and the value of the first one found in ``post`` is returned.
    Returns None when none of them is present.
    """
    candidates = ('published', 'created', 'updated')
    return next((post[field] for field in candidates if field in post), None)
def get_link(post):
    """Return the ``link`` attribute of a feed entry."""
    url = post.link
    return url
def get_title(post):
    """Return the entry's title, or an empty string if it has none."""
    return post.title if 'title' in post else ''
def remove_html_tags(txt):
    """Delete everything in ``txt`` that looks like an HTML tag.

    A tag is matched as '<', followed by any characters other than '<',
    up to the next '>'. The text between tags is left untouched.
    """
    return re.sub(r'<[^<]*?/?>', '', txt)
def remove_extra_spaces(txt):
    """Collapse every run of whitespace in ``txt`` into one space.

    Leading and trailing whitespace is reduced to a single space as
    well; it is not stripped.
    """
    # Splitting on whitespace runs and re-joining with one space is the
    # same as substituting each run with a single space.
    pieces = re.split(r'\s+', txt)
    return ' '.join(pieces)
def create_extract(txt):
    """Build a short plain-text teaser from an HTML snippet.

    Tags are removed and whitespace is collapsed first. Text of at most
    250 characters is returned unchanged. Longer text is cut at the last
    suitable break found between positions 200 and 250 and marked with
    "[...]". Break candidates are tried in this order: ". ", "! ", "? ",
    ", " and finally a plain space. If none of them occurs in that
    window, the text is cut hard after 250 characters.
    """
    stxt = remove_extra_spaces(remove_html_tags(txt))
    # Text that fits entirely needs no marker. This includes a length of
    # exactly 250, where nothing would be omitted.
    if len(stxt) <= 250:
        return stxt
    # Punctuation breaks keep the punctuation mark itself (hence the +1).
    for mark in ('. ', '! ', '? ', ', '):
        cut = stxt.rfind(mark, 200, 250)
        if cut > 0:
            return stxt[:cut + 1] + " [...]"
    # A plain word break drops the space before the marker is appended.
    cut = stxt.rfind(' ', 200, 250)
    if cut > 0:
        return stxt[:cut] + " [...]"
    # No break in the window: hard cut, marker attached directly.
    return stxt[:250] + "[...]"
def parse_feeds(weeks, username, blog):
    """Fetch one blog feed and record its recent posts in ``weeks``.

    Parameters:
        weeks: dict mapping a "YYYY-MM-DD" day string to a list of post
            dicts; it is updated in place.
        username: value stored as 'username' on every recorded post.
        blog: description of the blog; ``blog['feed']`` is the feed URI.

    Entries dated before the module-level ``START`` are ignored. A post
    is only appended if no post with the same URL is already recorded
    for that day. Entries without a usable date are skipped with a
    warning instead of aborting the whole run, and an entry without a
    description gets an empty extract.
    """
    feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
    uri = blog['feed']
    print("Retreiving ", uri)
    feed = feedparser.parse(uri)
    if not feed.entries:
        print("WARN: no entries for ", uri, file=sys.stderr)
    for entry in feed.entries:
        raw_date = get_date(entry)
        if raw_date is None:
            # No published/created/updated field: nothing to sort it by.
            print("WARN: entry without date in ", uri, file=sys.stderr)
            continue
        try:
            date = parse_published(raw_date)
        except (ValueError, OverflowError) as err:
            print("WARN: unparsable date in ", uri, ": ", err, file=sys.stderr)
            continue
        if date < START:
            continue
        key = date.strftime("%Y-%m-%d")
        day_posts = weeks.setdefault(key, [])
        url = get_link(entry)
        if any(known['url'] == url for known in day_posts):
            continue
        # Same membership test as get_title uses, so a missing
        # description does not raise.
        description = entry.description if 'description' in entry else ''
        day_posts.append(dict(date=date,
                              title=get_title(entry),
                              url=url,
                              username=username,
                              # NOTE(review): blog is indexed by 'feed' above, so
                              # blog[0] looks like a leftover from a list/tuple
                              # layout -- confirm the intended key for the blog name.
                              blogname=blog[0],
                              description=create_extract(description)))
# -- main
# Script body: clone the data repository, collect recent posts for the
# requested users (or all active users), write the report and stage the
# changed files in the clone.
config = settings.load_settings()

# Always work on a fresh clone; a leftover checkout is discarded.
if os.path.exists('data'):
    shutil.rmtree('data')
gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/test.git', 'data')

# users is used as a mapping (username -> user record) below, so both a
# missing file and an empty YAML document fall back to an empty dict.
try:
    with open('data/blogs.yaml') as f:
        users = yaml.safe_load(f.read()) or {}
except FileNotFoundError:
    users = {}
print(users)

os.makedirs('data/out', exist_ok=True)

# log maps a "YYYY-MM-DD" day string to the posts recorded for that day.
try:
    with open('data/out/report.yaml') as f:
        log = yaml.safe_load(f.read()) or {}
except FileNotFoundError:
    log = {}

# Disabled alternative: fixed start date taken from the configuration.
# START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
# Only posts from the last seven days (counted from local midnight) are collected.
START = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - datetime.timedelta(days=7)

if len(sys.argv) > 1:
    # Usernames given on the command line: process only those users.
    for username in sys.argv[1:]:
        if username not in users:
            print("WARN: unknown user ", username, file=sys.stderr)
            continue
        # Both code paths read the user's 'blogs' list, so explicitly
        # named users are processed the same way as in the full run.
        for blog in users[username]['blogs']:
            parse_feeds(log, username, blog)
else:
    for username, u in users.items():
        if 'end' in u:
            enddate = datetime.datetime.strptime(u['end'], '%Y/%m/%d')
            if enddate < datetime.datetime.now():
                print("User inactive: ", username)
                continue
        for blog in u['blogs']:
            parse_feeds(log, username, blog)

with open('data/out/report.yaml', 'w') as f:
    yaml.safe_dump(log, f)
gitrepo.index.add(['out/report.yaml'])

with open('data/blogs.yaml', 'w') as f:
    yaml.safe_dump(users, f)
gitrepo.index.add(['blogs.yaml'])

print(gitrepo.index.diff(gitrepo.head.commit))
# Committing and pushing are currently disabled; the diff above is only printed.
# gitrepo.index.commit('autocommit')
# gitrepo.remotes.origin.push()