Start: load/save data dynamically from git (WIP)

This commit is contained in:
Thomas Renger 2023-12-28 23:05:28 +01:00
parent 512619b214
commit b091d20bf3
3 changed files with 73 additions and 36 deletions

1
.gitignore vendored
View File

@ -2,3 +2,4 @@
*.pyc
settings.cfg
out/
data/

View File

@ -4,24 +4,12 @@ import feedparser
import datetime
import sys
import os
import shutil
import re
from dateutil.parser import parse
import dateutil.tz as tz
import settings
config=settings.load_settings()
with open('bloggers.yml') as f:
users = yaml.safe_load(f.read())
if not os.path.exists('out'):
os.makedirs('out')
try:
with open('out/report.yml') as f:
log = yaml.safe_load(f.read())
except IOError:
log = {}
START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
from git import Repo
def parse_published(pub):
try:
@ -37,8 +25,40 @@ def get_date(post):
def get_link(post):
    """Return the ``link`` attribute of a feed entry.

    Raises whatever the attribute lookup on ``post`` raises when the
    entry has no ``link``.
    """
    link = post.link
    return link
def parse_feeds(weeks, uri):
def get_title(post):
    """Return the entry's title, or an empty string when it has none.

    ``post`` must support both a ``'title' in post`` membership test and
    ``post.title`` attribute access.
    """
    return post.title if 'title' in post else ''
def remove_html_tags(txt):
    """Return ``txt`` with every ``<...>`` span removed.

    A span starts at ``<``, contains no further ``<``, and ends at the
    first following ``>``. A ``<`` with no closing ``>`` is left alone.
    """
    return re.sub(r'<[^<]*?/?>', '', txt)
def remove_extra_spaces(txt):
    """Collapse each run of whitespace in ``txt`` into one space.

    Leading and trailing whitespace is collapsed as well, not stripped.
    """
    return re.sub(r'\s+', ' ', txt)
def create_extract(txt):
    """Build a short plain-text extract from an HTML fragment.

    Tags are removed and whitespace runs collapsed first. Text shorter
    than 250 characters is returned unchanged. Longer text is cut at the
    last suitable break found between index 200 and 250, trying sentence
    ends ('. ', '! ', '? ') first, then ', ', then a plain space, and
    " [...]" is appended. If no break is found, the text is cut hard at
    250 characters and "[...]" is appended without a leading space.
    """
    cleaned = remove_extra_spaces(remove_html_tags(txt))
    if len(cleaned) < 250:
        return cleaned

    # Each entry is (separator, number of separator characters to keep).
    # Punctuation stays in the extract; a bare space does not.
    break_points = (
        ('. ', 1),
        ('! ', 1),
        ('? ', 1),
        (', ', 1),
        (' ', 0),
    )
    for separator, keep in break_points:
        cut = cleaned.rfind(separator, 200, 250)
        if cut > 0:
            return cleaned[:cut + keep] + " [...]"

    return cleaned[:250] + "[...]"
def parse_feeds(weeks, username, blog):
feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
uri = blog[3]
feed = feedparser.parse(uri)
if not feed.entries:
@ -48,28 +68,45 @@ def parse_feeds(weeks, uri):
if date < START:
continue
wn = int ( (date - START).days / 7 )
while len(weeks) <= wn:
weeks.append([])
key = date.strftime("%Y-%m-%d")
if 'title' in post:
post = dict(date=date,
title=post.title,
url=get_link(post))
if 'title' not in post:
post = dict(date=date,
title="",
url=get_link(post))
if post['url'] not in [p['url'] for p in weeks[wn]]:
weeks[wn].append(post)
weeks.setdefault(key, [])
post = dict(date=date,
title=get_title(post),
url=get_link(post),
username=username,
blogname=blog[0],
description=create_extract(post.description))
if post['url'] not in [p['url'] for p in weeks[key]]:
weeks[key].append(post)
config=settings.load_settings()
if os.path.exists('data'):
shutil.rmtree('data')
gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/content-society.git', 'data')
with open('data/bloggers.yml') as f:
users = yaml.safe_load(f.read())
if not os.path.exists('data/out'):
os.makedirs('data/out')
try:
with open('data/out/report.yml') as f:
log = yaml.safe_load(f.read())
except IOError:
log = {}
START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
if len(sys.argv) > 1:
for username in sys.argv[1:]:
blogs = log.setdefault(username, {})
for l in users[username]['links']:
weeks = blogs.setdefault(l[0], [])
parse_feeds(weeks, l[3])
parse_feeds(log, username, l)
else:
for (username, u) in list(users.items()):
if 'end' in u:
@ -77,10 +114,8 @@ else:
if enddate < datetime.datetime.now():
print("User inactive: ", username)
continue
blogs = log.setdefault(username, {})
for l in u['links']:
weeks = blogs.setdefault(l[0], [])
parse_feeds(weeks, l[3])
parse_feeds(log, username, l)
with open('out/report.yml', 'w') as f:
with open('data/out/report.yml', 'w') as f:
yaml.safe_dump(log, f)

View File

@ -1,3 +1,4 @@
pyyaml
feedparser
python-dateutil
python-dateutil
GitPython