From 91d78397034e938e107e6888e595b304fe29e0f3 Mon Sep 17 00:00:00 2001
From: HipsterCat
Date: Tue, 30 Oct 2018 16:58:01 +0100
Subject: [PATCH] Added logging, use html5lib instead of html.parser (better
 results overall), disallow adding feed with missing required fields

---
 hrss/settings.py | 36 +++++++++++++++++++++++++++++++++++-
 requirements.txt |  3 ++-
 web/utils.py     | 37 ++++++++++++++++++++++++++-----------
 web/views.py     | 25 +++++++++++++++++++------
 4 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/hrss/settings.py b/hrss/settings.py
index 29cb71a..8a93130 100644
--- a/hrss/settings.py
+++ b/hrss/settings.py
@@ -122,4 +122,38 @@ USE_TZ = True
 STATIC_URL = '/static/'
 STATIC_ROOT = os.path.join(BASE_DIR, 'static')
 
-USE_X_FORWARDED_HOST = True
\ No newline at end of file
+USE_X_FORWARDED_HOST = True
+
+LOGGING = {
+    'version': 1,
+    'disable_existing_loggers': False,
+    'formatters': {
+        'verbose': {
+            'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s'
+        },
+        'simple': {
+            'format': '%(levelname)s %(message)s'
+        },
+    },
+    'handlers': {
+        'file': {
+            'class': 'logging.FileHandler',
+            'filename': 'debug.log',
+            'formatter': 'verbose'
+        },
+        'console': {
+            'class': 'logging.StreamHandler',
+            'formatter': 'verbose'
+        },
+    },
+    'loggers': {
+        'web': {
+            'handlers': ['console', 'file'],
+            'level': 'DEBUG',
+        },
+        'django': {
+            'handlers': ['console', 'file'],
+            'level': 'ERROR',
+        }
+    }
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 34d744b..d322895 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 Django>=2.1.2
 beautifulsoup4>=4.6.3
-requests>=2.20.0
\ No newline at end of file
+requests>=2.20.0
+html5lib>=1.0.1
\ No newline at end of file
diff --git a/web/utils.py b/web/utils.py
index 1a39b85..bb49ba2 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -2,20 +2,25 @@ import requests
 from django.core.validators import URLValidator
 from django.core.exceptions import ValidationError
 from requests.exceptions import RequestException
-
 from bs4 import BeautifulSoup
+import logging
+logger = logging.getLogger(__name__)
+
+def get_url(url):
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
+    return requests.get(url, headers=headers)
 
 
 def is_valid_url(url):
     try:
         URLValidator()(url)
-        requests.get(url).raise_for_status()
+        get_url(url).raise_for_status()
         return True
     except ValidationError as e:
-        print(url+" is not a valid url")
+        logger.debug(url+" is not a valid url")
         return False
     except RequestException as e:
-        print(url+" led to exception: "+str(e))
+        logger.debug(url+" led to exception: "+str(e))
         return False
 
 
@@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
     from difflib import SequenceMatcher
 
     match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
-    return s1[match.a: match.a + match.size]  # -> apple pie
+    return s1[match.a: match.a + match.size]
+
 
 def fetch_feed(feed, limit=10):
+    logger.debug("Fetching feed "+feed.url)
     items=[]
-    rep = requests.get(feed.url).text
-    soup = BeautifulSoup(rep)
-    elements = soup.select(feed.element)
+    rep = get_url(feed.url).text
+    soup = BeautifulSoup(rep, "html5lib")
+    try:
+        elements = soup.select(feed.element)
+    except Exception as e:
+        logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
+        return False
+    logger.debug("Found "+str(len(elements))+" news")
    for element in elements:
         if len(items) >= limit:
             break
-
+        logger.debug("Title ("+feed.element+" > "+feed.title+")")
         try:
             title = element.select(feed.title)[0].text
-        except Exception:
+            logger.debug("Match title "+title)
+        except Exception as e:
+            logger.debug("Error while selecting feed title: "+str(e))
             return False
         try:
             content = element.select(feed.content)[0].text
-        except Exception:
+        except Exception as e:
+            logger.debug("Error while selecting content: "+str(e))
             return False
         try:
             date = element.select(feed.date)[0].text
diff --git a/web/views.py b/web/views.py
index cb6337d..5108639 100644
--- a/web/views.py
+++ b/web/views.py
@@ -4,22 +4,25 @@ from .utils import *
 from .models import Feed
 from django.db.models import ObjectDoesNotExist
 from bs4 import BeautifulSoup
+import logging
 
 
 # Create your views here.
+logger = logging.getLogger(__name__)
 
 
 def iframe(request, url):
-    content_type = False
     try:
-        req = requests.get(url)
+        req = get_url(url)
         html = req.content
         bs = False
         content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
         if not content_type or content_type.startswith("text/html"):
+            logger.debug("No content-type or content-type ~= '^text/html'")
             bs = BeautifulSoup(html, 'html.parser')
             base_scheme = url.split("://")[0]
             base_url = url.split("//")[-1].split("/")[0].split('?')[0]
+            logger.debug("URL: "+base_scheme+"://"+base_url)
 
             # fixes
             # fix click links
@@ -33,6 +36,11 @@ def iframe(request, url):
                 if link["href"].startswith("/"):
                     link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
 
+            # test: remove js
+            all_scripts = bs.find_all("script")
+            for script in all_scripts:
+                script.extract()
+
             # fix absolute javascript
             all_scripts = bs.find_all("script", {"src": True})
             for script in all_scripts:
@@ -48,7 +56,8 @@ def iframe(request, url):
             html = final_html
 
     except Exception as e:
-        html = str(e)
+        logger.debug(e)
+        return HttpResponse("An error has occurred", status=500)
     return HttpResponse(html, content_type=content_type)
 
 def dummy(request):
@@ -73,7 +82,7 @@ def setup(request, url):
 
 def newfeed(request):
     if request.method == 'POST':
-        if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST:
+        if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
             return HttpResponse("Error, missing required element")
         url = request.POST["url"]
         element = request.POST["element"]
@@ -114,6 +123,7 @@ def feed_delete(request, id):
     # demo website: disable deleting feeds
     if not request.get_host() == "hrss.hipstercat.fr:443":
         Feed.objects.get(pk=id).delete()
+        logger.info("Removed feed ID "+str(id))
         return redirect("/feeds")
     else:
         return HttpResponse("Deleting is disabled on demo website.", status=403)
@@ -123,6 +133,9 @@ def feed_delete(request, id):
 def rss(request, uurl):
     try:
         feed = Feed.objects.get(uurl=uurl)
-        return render(request, "feed.xml", {"feed": feed, "rss": fetch_feed(feed)}, content_type="application/rss+xml")
+        fetched = fetch_feed(feed)
+        if not fetched or not fetched["items"]:
+            return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
+        return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
     except ObjectDoesNotExist:
-        return HttpResponse("nope")
\ No newline at end of file
+        return HttpResponse("Error: feed is unknown", status=404)
\ No newline at end of file