#!/usr/bin/python3
import yaml
import feedparser
import datetime
import sys
import os
import shutil
import re
from dateutil.parser import parse
import dateutil.tz as tz
import settings
from git import Repo


def parse_published(pub):
    # Normalize to a naive datetime in local time; fall back to stripping the
    # timezone if astimezone() fails (e.g. on naive input under older Pythons).
    try:
        return parse(pub).astimezone(tz.tzlocal()).replace(tzinfo=None)
    except Exception:
        return parse(pub).replace(tzinfo=None)


def get_date(post):
    for k in ('published', 'created', 'updated'):
        if k in post:
            return post[k]


def get_link(post):
    return post.link


def get_title(post):
    if 'title' in post:
        return post.title
    return ''


def remove_html_tags(txt):
    p = re.compile(r'<[^<]*?/?>')
    return p.sub('', txt)


def remove_extra_spaces(txt):
    p = re.compile(r'\s+')
    return p.sub(' ', txt)


def create_extract(txt):
    # Build a short teaser: prefer cutting at a sentence end between
    # characters 200 and 250, then at a comma, then at any space.
    stxt = remove_extra_spaces(remove_html_tags(txt))
    if len(stxt) < 250:
        return stxt
    for sep in ('. ', '! ', '? ', ', '):
        pos = stxt.rfind(sep, 200, 250)
        if pos > 0:
            return stxt[:pos + 1] + " [...]"
    pos = stxt.rfind(' ', 200, 250)
    if pos > 0:
        return stxt[:pos] + " [...]"
    return stxt[:250] + "[...]"


def parse_feeds(weeks, username, blog):
    feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
    uri = blog['feed']
    print("Retrieving", uri)
    feed = feedparser.parse(uri)
    if not feed.entries:
        print("WARN: no entries for", uri, file=sys.stderr)
    for post in feed.entries:
        date = parse_published(get_date(post))
        if date < START:
            continue
        key = date.strftime("%Y-%m-%d")
        weeks.setdefault(key, [])
        post = dict(date=date,
                    title=get_title(post),
                    url=get_link(post),
                    username=username,
                    # assumes each blog entry carries a human-readable 'name'
                    # next to its 'feed' URL in blogs.yaml
                    blogname=blog.get('name', ''),
                    description=create_extract(post.description))
        # Skip posts we have already recorded for this day.
        if post['url'] not in [p['url'] for p in weeks[key]]:
            weeks[key].append(post)


# -- main
config = settings.load_settings()

# Work on a fresh clone of the data repository.
if os.path.exists('data'):
    shutil.rmtree('data')
gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/test.git', 'data')

try:
    with open('data/blogs.yaml') as f:
        users = yaml.safe_load(f.read())
except FileNotFoundError:
    users = {}
print(users)

if not os.path.exists('data/out'):
    os.makedirs('data/out')

try:
    with open('data/out/report.yaml') as f:
        log = yaml.safe_load(f.read())
except FileNotFoundError:
    log = {}

# Only posts from the last seven days (midnight-aligned) are scanned.
# START = datetime.datetime.strptime(config['start_date'], '%Y/%m/%d')
START = datetime.datetime.now().replace(hour=0, minute=0, second=0,
                                        microsecond=0) - datetime.timedelta(days=7)

if len(sys.argv) > 1:
    # Scan only the users named on the command line; use the same 'blogs'
    # key as the full scan below.
    for username in sys.argv[1:]:
        for l in users[username]['blogs']:
            parse_feeds(log, username, l)
else:
    for (username, u) in list(users.items()):
        if 'end' in u:
            enddate = datetime.datetime.strptime(u['end'], '%Y/%m/%d')
            if enddate < datetime.datetime.now():
                print("User inactive:", username)
                continue
        for l in u['blogs']:
            parse_feeds(log, username, l)

with open('data/out/report.yaml', 'w') as f:
    yaml.safe_dump(log, f)
gitrepo.index.add(['out/report.yaml'])

with open('data/blogs.yaml', 'w') as f:
    yaml.safe_dump(users, f)
gitrepo.index.add(['blogs.yaml'])

print(gitrepo.index.diff(gitrepo.head.commit))
# gitrepo.index.commit('autocommit')
# gitrepo.remotes.origin.push()
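
# ---------------------------------------------------------------------------
# Sketch of the data/blogs.yaml layout this script appears to assume. Only the
# 'feed' and 'end' keys and the per-user 'blogs' list are taken from the code
# above; the 'name' key and all example values are illustrative assumptions,
# not confirmed by the upstream repository.
#
#   alice:
#     end: 2024/12/31            # optional; user counts as inactive afterwards
#     blogs:
#       - name: Alice's Blog
#         feed: https://example.org/feed.xml
#   bob:
#     blogs:
#       - name: Bob writes
#         feed: https://example.net/atom.xml
#
# Invocation: run without arguments to scan every active user, or pass one or
# more usernames to restrict the scan to those users.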