Start: load/save data dynamically from git (WIP)

Thomas Renger 2023-12-28 23:05:28 +01:00
parent 512619b214
commit b091d20bf3
3 changed files with 73 additions and 36 deletions

.gitignore (vendored)

@@ -2,3 +2,4 @@
 *.pyc
 settings.cfg
 out/
+data/

[feed scanner script]

@@ -4,24 +4,12 @@ import feedparser
 import datetime
 import sys
 import os
+import shutil
+import re
 from dateutil.parser import parse
 import dateutil.tz as tz
 import settings
+from git import Repo
-config=settings.load_settings()
-with open('bloggers.yml') as f:
-    users = yaml.safe_load(f.read())
-if not os.path.exists('out'):
-    os.makedirs('out')
-try:
-    with open('out/report.yml') as f:
-        log = yaml.safe_load(f.read())
-except IOError:
-    log = {}
-START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
 def parse_published(pub):
     try:
@@ -37,8 +25,40 @@ def get_date(post):
 def get_link(post):
     return post.link
-def parse_feeds(weeks, uri):
+def get_title(post):
+    if 'title' in post:
+        return post.title
+    else:
+        return ''
+def remove_html_tags(txt):
+    p = re.compile(r'<[^<]*?/?>')
+    return p.sub('', txt)
+def remove_extra_spaces(txt):
+    p = re.compile(r'\s+')
+    return p.sub(' ', txt)
+def create_extract(txt):
+    stxt = remove_extra_spaces(remove_html_tags(txt))
+    if len(stxt) < 250:
+        return stxt
+    if stxt.rfind('. ',200,250)>0:
+        return stxt[:stxt.rfind('. ',200,250)+1]+" [...]"
+    if stxt.rfind('! ',200,250)>0:
+        return stxt[:stxt.rfind('! ',200,250)+1]+" [...]"
+    if stxt.rfind('? ',200,250)>0:
+        return stxt[:stxt.rfind('? ',200,250)+1]+" [...]"
+    if stxt.rfind(', ',200,250)>0:
+        return stxt[:stxt.rfind(', ',200,250)+1]+" [...]"
+    if stxt.rfind(' ',200,250)>0:
+        return stxt[:stxt.rfind(' ',200,250)]+" [...]"
+    return stxt[:250]+"[...]"
+def parse_feeds(weeks, username, blog):
     feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
+    uri = blog[3]
     feed = feedparser.parse(uri)
     if not feed.entries:
@@ -48,28 +68,45 @@ def parse_feeds(weeks, uri):
         if date < START:
             continue
-        wn = int ( (date - START).days / 7 )
-        while len(weeks) <= wn:
-            weeks.append([])
-        if 'title' in post:
-            post = dict(date=date,
-                        title=post.title,
-                        url=get_link(post))
-        if 'title' not in post:
-            post = dict(date=date,
-                        title="",
-                        url=get_link(post))
-        if post['url'] not in [p['url'] for p in weeks[wn]]:
-            weeks[wn].append(post)
+        key = date.strftime("%Y-%m-%d")
+        weeks.setdefault(key, [])
+        post = dict(date=date,
+                    title=get_title(post),
+                    url=get_link(post),
+                    username=username,
+                    blogname=blog[0],
+                    description=create_extract(post.description))
+        if post['url'] not in [p['url'] for p in weeks[key]]:
+            weeks[key].append(post)
+config=settings.load_settings()
+if os.path.exists('data'):
+    shutil.rmtree('data')
+gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/content-society.git', 'data')
+with open('data/bloggers.yml') as f:
+    users = yaml.safe_load(f.read())
+if not os.path.exists('data/out'):
+    os.makedirs('data/out')
+try:
+    with open('data/out/report.yml') as f:
+        log = yaml.safe_load(f.read())
+except IOError:
+    log = {}
+START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
 if len(sys.argv) > 1:
     for username in sys.argv[1:]:
         blogs = log.setdefault(username, {})
         for l in users[username]['links']:
-            weeks = blogs.setdefault(l[0], [])
-            parse_feeds(weeks, l[3])
+            parse_feeds(log, username, l)
 else:
     for (username, u) in list(users.items()):
         if 'end' in u:
@@ -77,10 +114,8 @@ else:
             if enddate < datetime.datetime.now():
                 print("User inactive: ", username)
                 continue
-        blogs = log.setdefault(username, {})
         for l in u['links']:
-            weeks = blogs.setdefault(l[0], [])
-            parse_feeds(weeks, l[3])
+            parse_feeds(log, username, l)
-with open('out/report.yml', 'w') as f:
+with open('data/out/report.yml', 'w') as f:
     yaml.safe_dump(log, f)
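
The reworked parse_feeds above files each post into the shared log dict under its publication date ("YYYY-MM-DD") instead of appending to a per-blog list indexed by week number, and it now records username, blog name, and a description produced by the new create_extract helper. create_extract strips HTML tags, collapses whitespace, and trims the text to at most roughly 250 characters, preferring to cut at a sentence-like boundary between positions 200 and 250. A condensed but behavior-equivalent sketch (the repeated rfind branches are folded into a loop; the sample text is made up):

import re

def remove_html_tags(txt):
    # drop anything that looks like an HTML tag
    return re.sub(r'<[^<]*?/?>', '', txt)

def remove_extra_spaces(txt):
    # collapse any whitespace run into a single space
    return re.sub(r'\s+', ' ', txt)

def create_extract(txt):
    stxt = remove_extra_spaces(remove_html_tags(txt))
    if len(stxt) < 250:
        return stxt
    # prefer boundaries between positions 200 and 250, in the same
    # order as the diff: '. ', '! ', '? ', ', ', then a plain space
    for sep in ('. ', '! ', '? ', ', '):
        cut = stxt.rfind(sep, 200, 250)
        if cut > 0:
            return stxt[:cut + 1] + " [...]"
    cut = stxt.rfind(' ', 200, 250)
    if cut > 0:
        return stxt[:cut] + " [...]"
    return stxt[:250] + "[...]"

print(create_extract("<p>short post</p>"))
# -> 'short post' (under 250 characters, returned unchanged)
print(create_extract("<p>" + "Lorem ipsum dolor sit amet. " * 20 + "</p>"))
# -> the first ~223 characters, cut after a full stop, plus ' [...]'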

[pip requirements]

@@ -1,3 +1,4 @@
 pyyaml
 feedparser
 python-dateutil
+GitPython
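
GitPython joins the requirements because the scanner now clones its content repository on every run instead of reading bloggers.yml and out/report.yml from the working directory. A minimal sketch of the load-and-write flow as wired up in the diff above; the other half of "load/save" (committing and pushing data/out/report.yml back to the remote) is not part of this WIP commit yet:

import os
import shutil
import yaml
from git import Repo

# start from a fresh clone each run; data/ is gitignored above
if os.path.exists('data'):
    shutil.rmtree('data')
repo = Repo.clone_from('https://git.wazong.de/iron-blogger/content-society.git', 'data')

# bloggers.yml and the report now live inside the cloned repository
with open('data/bloggers.yml') as f:
    users = yaml.safe_load(f.read())

if not os.path.exists('data/out'):
    os.makedirs('data/out')
try:
    with open('data/out/report.yml') as f:
        log = yaml.safe_load(f.read())
except IOError:
    log = {}

# ... feed scanning fills `log` (see parse_feeds above) ...

with open('data/out/report.yml', 'w') as f:
    yaml.safe_dump(log, f)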