Added logging, use html5lib instead of html.parser (better results overall), disallow adding feed with missing required fields

This commit is contained in:
Amazed 2018-10-30 16:58:01 +01:00
parent be90bbd67c
commit 91d7839703
4 changed files with 82 additions and 19 deletions

View File

@ -122,4 +122,38 @@ USE_TZ = True
STATIC_URL = '/static/' STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static') STATIC_ROOT = os.path.join(BASE_DIR, 'static')
USE_X_FORWARDED_HOST = True USE_X_FORWARDED_HOST = True
# Logging configuration, written in the logging.config.dictConfig schema.
LOGGING = {
    'version': 1,
    # Keep loggers that were created before this configuration is applied.
    'disable_existing_loggers': False,
    'formatters': {
        # Timestamp, level and originating module in front of the message.
        'verbose': {
            'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s',
        },
        # Level and message only; no handler below references this one.
        'simple': {
            'format': '%(levelname)s %(message)s',
        },
    },
    'handlers': {
        # Relative filename: the log file is created in whatever directory
        # the process is started from.
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'debug.log',
            'formatter': 'verbose',
        },
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'verbose',
        },
    },
    'loggers': {
        # Presumably the project's own app package (modules obtain their
        # logger via logging.getLogger(__name__)) -- TODO confirm.
        'web': {
            'handlers': ['console', 'file'],
            'level': 'DEBUG',
        },
        # Framework messages: only errors and above are recorded.
        'django': {
            'handlers': ['console', 'file'],
            'level': 'ERROR',
        },
    },
}

View File

@ -1,3 +1,4 @@
Django>=2.1.2 Django>=2.1.2
beautifulsoup4>=4.6.3 beautifulsoup4>=4.6.3
requests>=2.20.0 requests>=2.20.0
html5lib>=1.0.1

View File

@ -2,20 +2,25 @@ import requests
from django.core.validators import URLValidator from django.core.validators import URLValidator
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from requests.exceptions import RequestException from requests.exceptions import RequestException
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
def get_url(url, timeout=10):
    """Fetch ``url`` with an HTTP GET, sending a desktop Firefox User-Agent.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (connect / between bytes)
            before giving up. Without a timeout ``requests`` waits forever,
            so one unresponsive site would block the calling worker.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked
        here; callers decide whether to call ``raise_for_status()``.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed, including when ``timeout`` expires.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
    return requests.get(url, headers=headers, timeout=timeout)
def is_valid_url(url):
    """Return True when ``url`` is well-formed and can be fetched successfully.

    The URL is first checked with Django's ``URLValidator``. It is then
    requested through ``get_url`` and the response must not carry an HTTP
    error status (``raise_for_status``).

    Returns:
        True if both checks pass. False if the validator rejects the URL or
        the request raises a ``RequestException``; either failure is logged
        at DEBUG level.
    """
    try:
        URLValidator()(url)
        get_url(url).raise_for_status()
    except ValidationError:
        # %-style arguments are formatted by the logging module itself, so a
        # non-string url cannot raise TypeError inside this handler.
        logger.debug("%s is not a valid url", url)
        return False
    except RequestException as e:
        logger.debug("%s led to exception: %s", url, e)
        return False
    return True
@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
from difflib import SequenceMatcher from difflib import SequenceMatcher
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2)) match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
return s1[match.a: match.a + match.size] # -> apple pie return s1[match.a: match.a + match.size]
def fetch_feed(feed, limit=10): def fetch_feed(feed, limit=10):
logger.debug("Fetching feed "+feed.url)
items=[] items=[]
rep = requests.get(feed.url).text rep = get_url(feed.url).text
soup = BeautifulSoup(rep) soup = BeautifulSoup(rep, "html5lib")
elements = soup.select(feed.element) try:
elements = soup.select(feed.element)
except Exception as e:
logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
return False
logger.debug("Found "+str(len(elements))+" news")
for element in elements: for element in elements:
if len(items) >= limit: if len(items) >= limit:
break break
logger.debug("Title ("+feed.element+ " > "+feed.title+")")
try: try:
title = element.select(feed.title)[0].text title = element.select(feed.title)[0].text
except Exception: logger.debug("Match title "+title)
except Exception as e:
logger.debug("Error while selecting feed title: "+str(e))
return False return False
try: try:
content = element.select(feed.content)[0].text content = element.select(feed.content)[0].text
except Exception: except Exception as e:
logger.debug("Error while selecting content: "+str(e))
return False return False
try: try:
date = element.select(feed.date)[0].text date = element.select(feed.date)[0].text

View File

@ -4,22 +4,25 @@ from .utils import *
from .models import Feed from .models import Feed
from django.db.models import ObjectDoesNotExist from django.db.models import ObjectDoesNotExist
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import logging
# Create your views here. # Create your views here.
logger = logging.getLogger(__name__)
def iframe(request, url): def iframe(request, url):
content_type = False
try: try:
req = requests.get(url) req = get_url(url)
html = req.content html = req.content
bs = False bs = False
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
if not content_type or content_type.startswith("text/html"): if not content_type or content_type.startswith("text/html"):
logger.debug("No content-type or content-type ~= '^text/html'")
bs = BeautifulSoup(html, 'html.parser') bs = BeautifulSoup(html, 'html.parser')
base_scheme = url.split("://")[0] base_scheme = url.split("://")[0]
base_url = url.split("//")[-1].split("/")[0].split('?')[0] base_url = url.split("//")[-1].split("/")[0].split('?')[0]
logger.debug("URL: "+base_scheme+"://"+base_url)
# fixes # fixes
# fix click links # fix click links
@ -33,6 +36,11 @@ def iframe(request, url):
if link["href"].startswith("/"): if link["href"].startswith("/"):
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"] link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
# test: remove js
all_scripts = bs.find_all("script")
for script in all_scripts:
script.extract()
# fix absolute javascript # fix absolute javascript
all_scripts = bs.find_all("script", {"src": True}) all_scripts = bs.find_all("script", {"src": True})
for script in all_scripts: for script in all_scripts:
@ -48,7 +56,8 @@ def iframe(request, url):
html = final_html html = final_html
except Exception as e: except Exception as e:
html = str(e) logger.debug(e)
return HttpResponse("An error has occured", content_type=500)
return HttpResponse(html, content_type=content_type) return HttpResponse(html, content_type=content_type)
def dummy(request): def dummy(request):
@ -73,7 +82,7 @@ def setup(request, url):
def newfeed(request): def newfeed(request):
if request.method == 'POST': if request.method == 'POST':
if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST: if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
return HttpResponse("Error, missing required element") return HttpResponse("Error, missing required element")
url = request.POST["url"] url = request.POST["url"]
element = request.POST["element"] element = request.POST["element"]
@ -114,6 +123,7 @@ def feed_delete(request, id):
# demo website: disable deleting feeds # demo website: disable deleting feeds
if not request.get_host() == "hrss.hipstercat.fr:443": if not request.get_host() == "hrss.hipstercat.fr:443":
Feed.objects.get(pk=id).delete() Feed.objects.get(pk=id).delete()
logger.info("Removed feed ID "+id)
return redirect("/feeds") return redirect("/feeds")
else: else:
return HttpResponse("Deleting is disabled on demo website.", status=403) return HttpResponse("Deleting is disabled on demo website.", status=403)
@ -123,6 +133,9 @@ def feed_delete(request, id):
def rss(request, uurl): def rss(request, uurl):
try: try:
feed = Feed.objects.get(uurl=uurl) feed = Feed.objects.get(uurl=uurl)
return render(request, "feed.xml", {"feed": feed, "rss": fetch_feed(feed)}, content_type="application/rss+xml") fetched = fetch_feed(feed)
if not fetched or not fetched["items"]:
return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
except ObjectDoesNotExist: except ObjectDoesNotExist:
return HttpResponse("nope") return HttpResponse("Error: feed is unknown", status=404)