iron-blogger/blogbot.py

140 lines
3.7 KiB
Python
Raw Permalink Normal View History

2019-03-08 12:43:50 +00:00
#!/usr/bin/python3
2009-12-28 23:09:11 +00:00
import yaml
import feedparser
import datetime
import sys
import os
import shutil
import re
2009-12-28 23:09:11 +00:00
from dateutil.parser import parse
import dateutil.tz as tz
import settings
from git import Repo
2009-12-28 23:09:11 +00:00
def parse_published(pub):
    """Parse a feed timestamp string into a naive datetime.

    The value is parsed once with ``dateutil.parser.parse``.  It is then
    converted to the local timezone and stripped of its tzinfo.  If that
    conversion fails, the parsed value is returned with its tzinfo simply
    dropped (no conversion).

    Errors raised by the parser itself (e.g. for ``None`` or an
    unparsable string) propagate to the caller.
    """
    stamp = parse(pub)
    try:
        return stamp.astimezone(tz.tzlocal()).replace(tzinfo=None)
    except (ValueError, OverflowError, OSError):
        # Only conversion problems are handled here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return stamp.replace(tzinfo=None)
2009-12-28 23:09:11 +00:00
def get_date(post):
    """Return the first available date field of a feed entry.

    The keys 'published', 'created' and 'updated' are tried in that order;
    ``None`` is returned when the entry has none of them.
    """
    candidates = (
        post[field]
        for field in ('published', 'created', 'updated')
        if field in post
    )
    return next(candidates, None)
2009-12-28 23:09:11 +00:00
def get_link(post):
    """Return the value of the entry's ``link`` attribute."""
    url = post.link
    return url
2009-12-28 23:09:11 +00:00
def get_title(post):
    """Return the entry's title, or an empty string if it has none."""
    return post.title if 'title' in post else ''
def remove_html_tags(txt):
    """Return ``txt`` with every ``<...>`` tag-like span deleted."""
    return re.sub(r'<[^<]*?/?>', '', txt)
def remove_extra_spaces(txt):
    """Collapse each run of whitespace in ``txt`` into a single space."""
    return re.sub(r'\s+', ' ', txt)
def create_extract(txt):
    """Build a short plain-text teaser from an HTML fragment.

    Tags are removed and whitespace is collapsed first.  Text shorter than
    250 characters is returned unchanged.  Longer text is cut at the last
    suitable break found between offsets 200 and 250 -- sentence end
    ('. ', '! ', '? '), then ', ', then a plain space -- and " [...]" is
    appended.  If no break exists in that window the text is cut hard at
    250 characters and "[...]" is appended without a space.
    """
    cleaned = remove_extra_spaces(remove_html_tags(txt))
    if len(cleaned) < 250:
        return cleaned

    # (separator, how many separator characters stay in the extract),
    # listed in order of preference.
    break_points = (('. ', 1), ('! ', 1), ('? ', 1), (', ', 1), (' ', 0))
    for separator, keep in break_points:
        cut = cleaned.rfind(separator, 200, 250)
        if cut > 0:
            return cleaned[:cut + keep] + " [...]"
    return cleaned[:250] + "[...]"
def parse_feeds(weeks, username, blog):
    """Fetch one blog feed and file its recent posts into ``weeks``.

    Args:
        weeks: dict updated in place; maps a ``YYYY-MM-DD`` string to a
            list of post dicts (keys: date, title, url, username,
            blogname, description).
        username: value stored under ``username`` on every post dict.
        blog: blog description; ``blog['feed']`` is the feed URL.

    Entries dated before the module-level ``START`` are ignored.  Entries
    whose date is missing or cannot be parsed are skipped with a warning
    instead of aborting the run.  A post is only appended if no post with
    the same URL is already stored for that day.
    """
    feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
    uri = blog['feed']
    print("Retreiving ", uri)
    feed = feedparser.parse(uri)
    if not feed.entries:
        print("WARN: no entries for ", uri, file=sys.stderr)
    for entry in feed.entries:
        try:
            date = parse_published(get_date(entry))
        except (TypeError, ValueError, OverflowError) as err:
            # get_date() returns None when the entry has no date field, and
            # the parser rejects malformed strings; one bad entry should
            # not stop the whole feed from being processed.
            print("WARN: skipping entry without usable date in ", uri,
                  ": ", err, file=sys.stderr)
            continue
        if date < START:
            continue
        key = date.strftime("%Y-%m-%d")
        day_posts = weeks.setdefault(key, [])
        # Not every feed entry carries a description; fall back to an
        # empty extract, mirroring how get_title() treats a missing title.
        description = entry.description if 'description' in entry else ''
        record = dict(date=date,
                      title=get_title(entry),
                      url=get_link(entry),
                      username=username,
                      # NOTE(review): blog is indexed with 'feed' above and
                      # with 0 here -- confirm which field holds the blog
                      # name for the current blogs.yaml layout.
                      blogname=blog[0],
                      description=create_extract(description))
        if not any(known['url'] == record['url'] for known in day_posts):
            day_posts.append(record)
2024-03-05 21:51:03 +00:00
# -- main
# Script body: clone the data repository, scan the configured feeds,
# write the report and stage the changed files in git.
config = settings.load_settings()

# Always start from a fresh clone of the data repository.
if os.path.exists('data'):
    shutil.rmtree('data')
gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/test.git', 'data')

try:
    with open('data/blogs.yaml') as f:
        # An empty YAML document loads as None; treat it as "no users".
        users = yaml.safe_load(f.read()) or {}
except FileNotFoundError:
    # Must be a mapping: the code below uses users.items() / users[username].
    users = {}
print(users)

os.makedirs('data/out', exist_ok=True)

try:
    with open('data/out/report.yaml') as f:
        log = yaml.safe_load(f.read()) or {}
except FileNotFoundError:
    log = {}

# START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
# Only posts from the last seven days (counted from today's midnight) are
# collected; parse_feeds() reads this module-level value.
START = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - datetime.timedelta(days=7)

if len(sys.argv) > 1:
    # Usernames given on the command line: scan only those users.
    for username in sys.argv[1:]:
        log.setdefault(username, {})
        # NOTE(review): this branch reads 'links' while the branch below
        # reads 'blogs' -- confirm which key blogs.yaml actually uses.
        for blog in users[username]['links']:
            parse_feeds(log, username, blog)
else:
    for username, user in users.items():
        if 'end' in user:
            enddate = datetime.datetime.strptime(user['end'], '%Y/%m/%d')
            if enddate < datetime.datetime.now():
                print("User inactive: ", username)
                continue
        for blog in user['blogs']:
            parse_feeds(log, username, blog)

# Stage each file only after its ``with`` block has closed it, so the
# index sees the fully written content.
with open('data/out/report.yaml', 'w') as f:
    yaml.safe_dump(log, f)
gitrepo.index.add(['out/report.yaml'])
with open('data/blogs.yaml', 'w') as f:
    yaml.safe_dump(users, f)
gitrepo.index.add(['blogs.yaml'])
print(gitrepo.index.diff(gitrepo.head.commit))
# Committing and pushing are currently disabled:
# gitrepo.index.commit('autocommit')
# gitrepo.remotes.origin.push()