Compare commits


3 Commits

Author         SHA1        Message                                             Date
Thomas Renger  b091d20bf3  Start: load/save data dynamically from git (WIP)    2023-12-28 23:05:28 +01:00
Thomas Renger  512619b214  tYp0                                                 2023-12-28 23:03:06 +01:00
Thomas Renger  3c4eeed4c0  Rename scan-feeds to blogbot                         2023-12-28 23:00:34 +01:00
5 changed files with 125 additions and 88 deletions

.gitignore

@@ -2,3 +2,4 @@
 *.pyc
 settings.cfg
 out/
+data/
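(The new data/ entry keeps the working copy that blogbot.py, below, clones at runtime out of version control.)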

Dockerfile

@@ -6,4 +6,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
-CMD [ "python", "./scan-feeds.py" ]
+CMD [ "python", "./blogbot.py" ]

blogbot.py (new executable file, 121 lines)

@@ -0,0 +1,121 @@
#!/usr/bin/python3
import yaml
import feedparser
import datetime
import sys
import os
import shutil
import re
from dateutil.parser import parse
import dateutil.tz as tz
import settings
from git import Repo


# Normalize a feed timestamp to naive local time; fall back to simply
# stripping the timezone when the conversion fails.
def parse_published(pub):
    try:
        return parse(pub).astimezone(tz.tzlocal()).replace(tzinfo=None)
    except:
        return parse(pub).replace(tzinfo=None)
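
# Illustrative example (not part of the committed file): assuming TZ=Europe/Berlin,
#   parse_published('2023-12-28T21:00:00Z')
#   == datetime.datetime(2023, 12, 28, 22, 0)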


def get_date(post):
    # Feeds disagree on which date field they populate.
    for k in ('published', 'created', 'updated'):
        if k in post:
            return post[k]


def get_link(post):
    return post.link


def get_title(post):
    if 'title' in post:
        return post.title
    else:
        return ''


def remove_html_tags(txt):
    p = re.compile(r'<[^<]*?/?>')
    return p.sub('', txt)


def remove_extra_spaces(txt):
    p = re.compile(r'\s+')
    return p.sub(' ', txt)


# Build a plain-text extract of at most ~250 characters, preferring to cut
# at a sentence end, then a clause end, then a word boundary, all within
# the 200-250 character window.
def create_extract(txt):
    stxt = remove_extra_spaces(remove_html_tags(txt))
    if len(stxt) < 250:
        return stxt
    if stxt.rfind('. ', 200, 250) > 0:
        return stxt[:stxt.rfind('. ', 200, 250)+1] + " [...]"
    if stxt.rfind('! ', 200, 250) > 0:
        return stxt[:stxt.rfind('! ', 200, 250)+1] + " [...]"
    if stxt.rfind('? ', 200, 250) > 0:
        return stxt[:stxt.rfind('? ', 200, 250)+1] + " [...]"
    if stxt.rfind(', ', 200, 250) > 0:
        return stxt[:stxt.rfind(', ', 200, 250)+1] + " [...]"
    if stxt.rfind(' ', 200, 250) > 0:
        return stxt[:stxt.rfind(' ', 200, 250)] + " [...]"
    return stxt[:250] + "[...]"
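
# Illustrative example (not part of the committed file): a 400-character body
# whose last sentence break falls inside the 200-250 window is cut there:
#   create_extract('<p>' + 'x' * 228 + '. ' + 'y' * 170 + '</p>')
#   == 'x' * 228 + '. [...]'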


# Collect the posts of one feed into `weeks`, a dict keyed by post date
# (YYYY-MM-DD). blog is one entry of a user's links list: blog[0] is the
# blog name, blog[3] the feed URI.
def parse_feeds(weeks, username, blog):
    feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
    uri = blog[3]
    feed = feedparser.parse(uri)
    if not feed.entries:
        print("WARN: no entries for ", uri, file=sys.stderr)
    for post in feed.entries:
        date = parse_published(get_date(post))
        if date < START:
            continue
        key = date.strftime("%Y-%m-%d")
        weeks.setdefault(key, [])
        post = dict(date=date,
                    title=get_title(post),
                    url=get_link(post),
                    username=username,
                    blogname=blog[0],
                    description=create_extract(post.description))
        # Skip posts whose URL is already recorded for that date.
        if post['url'] not in [p['url'] for p in weeks[key]]:
            weeks[key].append(post)


config = settings.load_settings()

# Fetch a fresh working copy of the content repository instead of reading
# local files (the WIP part of this commit).
if os.path.exists('data'):
    shutil.rmtree('data')
gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/content-society.git', 'data')

with open('data/bloggers.yml') as f:
    users = yaml.safe_load(f.read())
if not os.path.exists('data/out'):
    os.makedirs('data/out')
try:
    with open('data/out/report.yml') as f:
        log = yaml.safe_load(f.read())
except IOError:
    log = {}
START = datetime.datetime.strptime(config['start_date'], '%Y/%m/%d')

if len(sys.argv) > 1:
    # Re-scan only the bloggers named on the command line.
    for username in sys.argv[1:]:
        # NOTE: 'blogs' is a leftover of the old per-user layout and is
        # unused; parse_feeds now writes date keys directly into 'log'.
        blogs = log.setdefault(username, {})
        for l in users[username]['links']:
            parse_feeds(log, username, l)
else:
    # Scan every blogger whose optional 'end' date has not passed.
    for (username, u) in list(users.items()):
        if 'end' in u:
            enddate = datetime.datetime.strptime(u['end'], '%Y/%m/%d')
            if enddate < datetime.datetime.now():
                print("User inactive: ", username)
                continue
        for l in u['links']:
            parse_feeds(log, username, l)

with open('data/out/report.yml', 'w') as f:
    yaml.safe_dump(log, f)
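
The commit also changes the shape of report.yml. The old script grouped posts per user and blog into week-number buckets; the new one keys a single flat dict by post date and carries the user and blog on each entry. A sketch of the two shapes as Python literals (username, blog name, and field values are made up):

    # old out/report.yml (scan-feeds.py): log[username][blogname][week_no]
    {'alice': {'Example Blog': [
        [{'date': datetime(2023, 12, 4, 12, 0), 'title': 'Post', 'url': 'https://example.org/post'}],  # week 0
        [],                                                                                             # week 1
    ]}}

    # new data/out/report.yml (blogbot.py, WIP): log[date]
    {'2023-12-28': [
        {'date': datetime(2023, 12, 28, 12, 0), 'title': 'Post', 'url': 'https://example.org/post',
         'username': 'alice', 'blogname': 'Example Blog',
         'description': 'First sentences of the post [...]'},
    ]}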

requirements.txt

@@ -1,3 +1,4 @@
 pyyaml
 feedparser
-python-dateutil
+python-dateutil
+GitPython
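
(GitPython provides the git.Repo class that blogbot.py uses to clone the content repository.)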

scan-feeds.py (deleted, 86 lines)

@@ -1,86 +0,0 @@
#!/usr/bin/python3
import yaml
import feedparser
import datetime
import sys
import os
from dateutil.parser import parse
import dateutil.tz as tz
import settings

config = settings.load_settings()

with open('bloggers.yml') as f:
    users = yaml.safe_load(f.read())
if not os.path.exists('out'):
    os.makedirs('out')
try:
    with open('out/report.yml') as f:
        log = yaml.safe_load(f.read())
except IOError:
    log = {}
START = datetime.datetime.strptime(config['start_date'], '%Y/%m/%d')


def parse_published(pub):
    try:
        return parse(pub).astimezone(tz.tzlocal()).replace(tzinfo=None)
    except:
        return parse(pub).replace(tzinfo=None)


def get_date(post):
    for k in ('published', 'created', 'updated'):
        if k in post:
            return post[k]


def get_link(post):
    return post.link


def parse_feeds(weeks, uri):
    feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
    feed = feedparser.parse(uri)
    if not feed.entries:
        print("WARN: no entries for ", uri, file=sys.stderr)
    for post in feed.entries:
        date = parse_published(get_date(post))
        if date < START:
            continue
        wn = int((date - START).days / 7)
        while len(weeks) <= wn:
            weeks.append([])
        if 'title' in post:
            post = dict(date=date,
                        title=post.title,
                        url=get_link(post))
        if 'title' not in post:
            post = dict(date=date,
                        title="",
                        url=get_link(post))
        if post['url'] not in [p['url'] for p in weeks[wn]]:
            weeks[wn].append(post)


if len(sys.argv) > 1:
    for username in sys.argv[1:]:
        blogs = log.setdefault(username, {})
        for l in users[username]['links']:
            weeks = blogs.setdefault(l[0], [])
            parse_feeds(weeks, l[3])
else:
    for (username, u) in list(users.items()):
        if 'end' in u:
            enddate = datetime.datetime.strptime(u['end'], '%Y/%m/%d')
            if enddate < datetime.datetime.now():
                print("User inactive: ", username)
                continue
        blogs = log.setdefault(username, {})
        for l in u['links']:
            weeks = blogs.setdefault(l[0], [])
            parse_feeds(weeks, l[3])

with open('out/report.yml', 'w') as f:
    yaml.safe_dump(log, f)
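
Both versions take the same command line: with usernames as arguments, only those bloggers are re-scanned; with no arguments, every blogger whose optional 'end' date has not passed is scanned. For example (usernames hypothetical):

    python3 ./blogbot.py             # all active bloggers, as in the Docker CMD
    python3 ./blogbot.py alice bob   # re-scan only these two users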