Compare commits
3 Commits
cbfd679f1c
...
b091d20bf3
Author | SHA1 | Date |
---|---|---|
Thomas Renger | b091d20bf3 | |
Thomas Renger | 512619b214 | |
Thomas Renger | 3c4eeed4c0 |
|
@ -2,3 +2,4 @@
|
||||||
*.pyc
|
*.pyc
|
||||||
settings.cfg
|
settings.cfg
|
||||||
out/
|
out/
|
||||||
|
data/
|
|
@ -6,4 +6,4 @@ RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
CMD [ "python", "./scan-feeds.py" ]
|
CMD [ "python", "./blogbot.py" ]
|
|
@ -0,0 +1,121 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
import yaml
|
||||||
|
import feedparser
|
||||||
|
import datetime
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import re
|
||||||
|
from dateutil.parser import parse
|
||||||
|
import dateutil.tz as tz
|
||||||
|
import settings
|
||||||
|
from git import Repo
|
||||||
|
|
||||||
|
def parse_published(pub):
    """Parse a feed timestamp string into a naive local-time datetime.

    Aware timestamps are converted to the local timezone and stripped of
    tzinfo so every post date compares cleanly against the naive START
    constant.  If that conversion fails, fall back to simply dropping
    the tzinfo from the parsed value.
    """
    try:
        return parse(pub).astimezone(tz.tzlocal()).replace(tzinfo=None)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any parsing/conversion error still
        # takes the fallback path.
        return parse(pub).replace(tzinfo=None)
|
||||||
|
|
||||||
|
def get_date(post):
    """Return the first available timestamp string of a feed entry.

    Feeds disagree on which date field they populate, so probe
    'published', then 'created', then 'updated'.  Returns None when the
    entry carries none of them.
    """
    candidates = ('published', 'created', 'updated')
    return next((post[field] for field in candidates if field in post), None)
|
||||||
|
|
||||||
|
def get_link(post):
    """Return the permalink URL of a feed entry."""
    return post.link
|
||||||
|
|
||||||
|
def get_title(post):
    """Return the entry title, or '' for title-less entries."""
    return post.title if 'title' in post else ''
|
||||||
|
|
||||||
|
def remove_html_tags(txt):
    """Strip anything that looks like an HTML/XML tag from *txt*.

    Uses the same lazy tag pattern as before; this is a rough stripper,
    not a full HTML parser.
    """
    return re.sub(r'<[^<]*?/?>', '', txt)
|
||||||
|
|
||||||
|
def remove_extra_spaces(txt):
    """Collapse every whitespace run (spaces, tabs, newlines) to one space."""
    return re.sub(r'\s+', ' ', txt)
|
||||||
|
|
||||||
|
def create_extract(txt):
    """Build a short plain-text teaser (at most ~250 chars) from HTML *txt*.

    Tags are stripped and whitespace collapsed first.  Texts shorter
    than 250 characters are returned verbatim; longer ones are cut at
    the most natural break found between positions 200 and 250 —
    sentence end ('. ', '! ', '? ') first, then a comma, then any
    space — and suffixed with "[...]".  A hard cut at 250 is the last
    resort.
    """
    stxt = remove_extra_spaces(remove_html_tags(txt))
    if len(stxt) < 250:
        return stxt
    # Try separators from strongest break to weakest; keep the separator's
    # punctuation character (hence the +1) but drop the trailing space.
    for sep in ('. ', '! ', '? ', ', '):
        cut = stxt.rfind(sep, 200, 250)
        if cut > 0:
            return stxt[:cut + 1] + " [...]"
    # A bare space: cut before it, no punctuation to keep.
    cut = stxt.rfind(' ', 200, 250)
    if cut > 0:
        return stxt[:cut] + " [...]"
    # No break at all (one giant token): hard cut.
    return stxt[:250] + "[...]"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_feeds(weeks, username, blog):
    """Fetch one blog feed and file its recent posts into *weeks*.

    *weeks* maps 'YYYY-MM-DD' date strings to lists of post dicts;
    *blog* is the per-blog link record (blog[0] = blog name, blog[3] =
    feed URI).  Posts dated before START, or whose URL is already
    present under the same date key, are skipped.
    """
    feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
    uri = blog[3]
    feed = feedparser.parse(uri)

    if not feed.entries:
        print("WARN: no entries for ", uri, file=sys.stderr)

    for entry in feed.entries:
        date = parse_published(get_date(entry))
        if date < START:
            continue

        key = date.strftime("%Y-%m-%d")
        bucket = weeks.setdefault(key, [])

        record = dict(date=date,
                      title=get_title(entry),
                      url=get_link(entry),
                      username=username,
                      blogname=blog[0],
                      description=create_extract(entry.description))
        # De-duplicate by URL within the same day.
        if record['url'] not in [p['url'] for p in bucket]:
            bucket.append(record)
|
||||||
|
|
||||||
|
# ---- main script ----------------------------------------------------------
config = settings.load_settings()

# Always start from a fresh checkout of the content repository.
if os.path.exists('data'):
    shutil.rmtree('data')

gitrepo = Repo.clone_from('https://git.wazong.de/iron-blogger/content-society.git', 'data')

with open('data/bloggers.yml') as f:
    users = yaml.safe_load(f.read())

if not os.path.exists('data/out'):
    os.makedirs('data/out')
try:
    with open('data/out/report.yml') as f:
        log = yaml.safe_load(f.read())
except IOError:
    # First run: no previous report exists yet.
    log = {}

START = datetime.datetime.strptime(config['start_date'], '%Y/%m/%d')

if len(sys.argv) > 1:
    # Explicit usernames on the command line: rescan only those feeds.
    # (Removed a stale `log.setdefault(username, {})` left over from the
    # old per-user data model: `log` is keyed by date strings now, so it
    # only injected bogus empty username entries into report.yml and was
    # inconsistent with the else branch below.)
    for username in sys.argv[1:]:
        for l in users[username]['links']:
            parse_feeds(log, username, l)
else:
    # No arguments: scan every user who has not ended participation.
    for (username, u) in list(users.items()):
        if 'end' in u:
            enddate = datetime.datetime.strptime(u['end'], '%Y/%m/%d')
            if enddate < datetime.datetime.now():
                print("User inactive: ", username)
                continue
        for l in u['links']:
            parse_feeds(log, username, l)

with open('data/out/report.yml', 'w') as f:
    yaml.safe_dump(log, f)
|
|
@ -1,3 +1,4 @@
|
||||||
pyyaml
|
pyyaml
|
||||||
feedparser
|
feedparser
|
||||||
python-dateutil
|
python-dateutil
|
||||||
|
GitPython
|
|
@ -1,86 +0,0 @@
|
||||||
#!/usr/bin/python3
|
|
||||||
import yaml
|
|
||||||
import feedparser
|
|
||||||
import datetime
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from dateutil.parser import parse
|
|
||||||
import dateutil.tz as tz
|
|
||||||
import settings
|
|
||||||
|
|
||||||
# --- removed version (pre-git): reads/writes the local working directory ---
config=settings.load_settings()

with open('bloggers.yml') as f:
    users = yaml.safe_load(f.read())

if not os.path.exists('out'):
    os.makedirs('out')
try:
    with open('out/report.yml') as f:
        log = yaml.safe_load(f.read())
except IOError:
    # First run: no previous report exists.
    log = {}

# Cutoff for posts: anything published before this date is ignored.
START = datetime.datetime.strptime(config['start_date'],'%Y/%m/%d')
|
|
||||||
|
|
||||||
def parse_published(pub):
    """Parse a feed timestamp into a naive local-time datetime.

    Falls back to simply dropping the tzinfo when the local-time
    conversion fails.
    """
    try:
        return parse(pub).astimezone(tz.tzlocal()).replace(tzinfo=None)
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit —
    # should be narrowed.
    except:
        return parse(pub).replace(tzinfo=None)
|
|
||||||
|
|
||||||
def get_date(post):
    """Return the first present timestamp field of a feed entry.

    Probes 'published', 'created', 'updated' in order; implicitly
    returns None when the entry has none of them.
    """
    for k in ('published', 'created', 'updated'):
        if k in post:
            return post[k]
|
|
||||||
|
|
||||||
def get_link(post):
    """Return the permalink URL of a feed entry."""
    return post.link
|
|
||||||
|
|
||||||
def parse_feeds(weeks, uri):
    """Fetch *uri* and append posts newer than START into *weeks*.

    *weeks* is a list of per-week post lists, indexed by whole weeks
    elapsed since START; it is grown on demand.
    """
    feedparser.USER_AGENT = "IronBloggerBot/0.2 +http://ironblogger.de/"
    feed = feedparser.parse(uri)

    if not feed.entries:
        print("WARN: no entries for ", uri, file=sys.stderr)
    for post in feed.entries:
        date = parse_published(get_date(post))

        if date < START:
            continue
        # Week bucket index: whole 7-day periods since START.
        wn = int ( (date - START).days / 7 )

        # Grow the list so index wn exists.
        while len(weeks) <= wn:
            weeks.append([])

        if 'title' in post:
            post = dict(date=date,
                        title=post.title,
                        url=get_link(post))
        # NOTE(review): the first branch rebinds `post` to a dict that always
        # contains 'title', so this second branch only fires for entries that
        # had no title at all — convoluted but effectively an if/else.
        if 'title' not in post:
            post = dict(date=date,
                        title="",
                        url=get_link(post))
        # De-duplicate by URL within the week bucket.
        if post['url'] not in [p['url'] for p in weeks[wn]]:
            weeks[wn].append(post)
|
|
||||||
|
|
||||||
# Per-user scan: log[username][blogname] = list of weekly post lists.
if len(sys.argv) > 1:
    # Explicit usernames on the command line: rescan only those users.
    for username in sys.argv[1:]:
        blogs = log.setdefault(username, {})
        for l in users[username]['links']:
            # l[0] is the blog name, l[3] the feed URI.
            weeks = blogs.setdefault(l[0], [])
            parse_feeds(weeks, l[3])
else:
    # No arguments: scan every user who has not ended participation.
    for (username, u) in list(users.items()):
        if 'end' in u:
            enddate = datetime.datetime.strptime(u['end'],'%Y/%m/%d')
            if enddate < datetime.datetime.now():
                print("User inactive: ", username)
                continue
        blogs = log.setdefault(username, {})
        for l in u['links']:
            weeks = blogs.setdefault(l[0], [])
            parse_feeds(weeks, l[3])

with open('out/report.yml', 'w') as f:
    yaml.safe_dump(log, f)
|
|
Loading…
Reference in New Issue