diff --git a/.gitignore b/.gitignore
index f11c624..0c2dad4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 *.pyc
 settings.cfg
 out/
+data/
\ No newline at end of file
diff --git a/blogbot.py b/blogbot.py
index 8c959b0..9607db4 100755
--- a/blogbot.py
+++ b/blogbot.py
@@ -4,24 +4,12 @@ import feedparser
 import datetime
 import sys
 import os
+import shutil
+import re
 from dateutil.parser import parse
 import dateutil.tz as tz
 import settings
-
-config=settings.load_settings()
-
-with open('bloggers.yml') as f:
-    users = yaml.safe_load(f.read())
-
-if not os.path.exists('out'):
-    os.makedirs('out')
-try:
-    with open('out/report.yml') as f:
-        log = yaml.safe_load(f.read())
-except IOError:
-    log = {}
-
-START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
+from git import Repo
 
 def parse_published(pub):
     try:
@@ -37,8 +25,40 @@ def get_date(post):
 def get_link(post):
     return post.link
 
-def parse_feeds(weeks, uri):
+def get_title(post):
+    if 'title' in post:
+        return post.title
+    else:
+        return ''
+
+def remove_html_tags(txt):
+    p = re.compile(r'<[^<]*?/?>')
+    return p.sub('', txt)
+
+def remove_extra_spaces(txt):
+    p = re.compile(r'\s+')
+    return p.sub(' ', txt)
+
+def create_extract(txt):
+    stxt = remove_extra_spaces(remove_html_tags(txt))
+    if len(stxt) < 250:
+        return stxt
+    if stxt.rfind('. ',200,250)>0:
+        return stxt[:stxt.rfind('. ',200,250)+1]+" [...]"
+    if stxt.rfind('! ',200,250)>0:
+        return stxt[:stxt.rfind('! ',200,250)+1]+" [...]"
+    if stxt.rfind('? ',200,250)>0:
+        return stxt[:stxt.rfind('? ',200,250)+1]+" [...]"
+    if stxt.rfind(', ',200,250)>0:
+        return stxt[:stxt.rfind(', ',200,250)+1]+" [...]"
+    if stxt.rfind(' ',200,250)>0:
+        return stxt[:stxt.rfind(' ',200,250)]+" [...]"
+    return stxt[:250]+"[...]"
+
+
+def parse_feeds(weeks, username, blog):
     feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
+    uri = blog[3]
     feed = feedparser.parse(uri)
 
     if not feed.entries:
@@ -48,28 +68,45 @@ def parse_feeds(weeks, uri):
         if date < START:
             continue
 
-        wn = int ( (date - START).days / 7 )
-        while len(weeks) <= wn:
-            weeks.append([])
+        key = date.strftime("%Y-%m-%d")
 
-        if 'title' in post:
-            post = dict(date=date,
-                        title=post.title,
-                        url=get_link(post))
-        if 'title' not in post:
-            post = dict(date=date,
-                        title="",
-                        url=get_link(post))
-        if post['url'] not in [p['url'] for p in weeks[wn]]:
-            weeks[wn].append(post)
+        weeks.setdefault(key, [])
+
+        post = dict(date=date,
+                    title=get_title(post),
+                    url=get_link(post),
+                    username=username,
+                    blogname=blog[0],
+                    description=create_extract(post.description))
+        if post['url'] not in [p['url'] for p in weeks[key]]:
+            weeks[key].append(post)
+
+config=settings.load_settings()
+
+if os.path.exists('data'):
+    shutil.rmtree('data')
+
+gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/content-society.git', 'data')
+
+with open('data/bloggers.yml') as f:
+    users = yaml.safe_load(f.read())
+
+if not os.path.exists('data/out'):
+    os.makedirs('data/out')
+try:
+    with open('data/out/report.yml') as f:
+        log = yaml.safe_load(f.read())
+except IOError:
+    log = {}
+
+START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
 
 
 if len(sys.argv) > 1:
     for username in sys.argv[1:]:
         blogs = log.setdefault(username, {})
         for l in users[username]['links']:
-            weeks = blogs.setdefault(l[0], [])
-            parse_feeds(weeks, l[3])
+            parse_feeds(log, username, l)
 else:
     for (username, u) in list(users.items()):
         if 'end' in u:
@@ -77,10 +114,8 @@ else:
             if enddate < datetime.datetime.now():
                 print("User inactive: ", username)
                 continue
-        blogs = log.setdefault(username, {})
         for l in u['links']:
-            weeks = blogs.setdefault(l[0], [])
-            parse_feeds(weeks, l[3])
+            parse_feeds(log, username, l)
 
-with open('out/report.yml', 'w') as f:
+with open('data/out/report.yml', 'w') as f:
     yaml.safe_dump(log, f)
diff --git a/requirements.txt b/requirements.txt
index 292df66..ed45c88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 pyyaml
 feedparser
-python-dateutil
\ No newline at end of file
+python-dateutil
+GitPython
\ No newline at end of file