Add logging, use html5lib instead of html.parser (better results overall), and disallow adding feeds with missing required fields

This commit is contained in:
Amazed 2018-10-30 16:58:01 +01:00
parent be90bbd67c
commit 91d7839703
4 changed files with 82 additions and 19 deletions

View File

@ -122,4 +122,38 @@ USE_TZ = True
STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
USE_X_FORWARDED_HOST = True
USE_X_FORWARDED_HOST = True
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'verbose': {
'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s'
},
'simple': {
'format': '%(levelname)s %(message)s'
},
},
'handlers': {
'file': {
'class': 'logging.FileHandler',
'filename': 'debug.log',
'formatter': 'verbose'
},
'console': {
'class': 'logging.StreamHandler',
'formatter': 'verbose'
},
},
'loggers': {
'web': {
'handlers': ['console', 'file'],
'level': 'DEBUG',
},
'django': {
'handlers': ['console', 'file'],
'level': 'ERROR',
}
}
}

View File

@ -1,3 +1,4 @@
Django>=2.1.2
beautifulsoup4>=4.6.3
requests>=2.20.0
requests>=2.20.0
html5lib>=1.0.1

View File

@ -2,20 +2,25 @@ import requests
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
def get_url(url):
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
return requests.get(url, headers=headers)
def is_valid_url(url):
try:
URLValidator()(url)
requests.get(url).raise_for_status()
get_url(url).raise_for_status()
return True
except ValidationError as e:
print(url+" is not a valid url")
logger.debug(url+" is not a valid url")
return False
except RequestException as e:
print(url+" led to exception: "+str(e))
logger.debug(url+" led to exception: "+str(e))
return False
@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
from difflib import SequenceMatcher
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
return s1[match.a: match.a + match.size] # -> apple pie
return s1[match.a: match.a + match.size]
def fetch_feed(feed, limit=10):
logger.debug("Fetching feed "+feed.url)
items=[]
rep = requests.get(feed.url).text
soup = BeautifulSoup(rep)
elements = soup.select(feed.element)
rep = get_url(feed.url).text
soup = BeautifulSoup(rep, "html5lib")
try:
elements = soup.select(feed.element)
except Exception as e:
logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
return False
logger.debug("Found "+str(len(elements))+" news")
for element in elements:
if len(items) >= limit:
break
logger.debug("Title ("+feed.element+ " > "+feed.title+")")
try:
title = element.select(feed.title)[0].text
except Exception:
logger.debug("Match title "+title)
except Exception as e:
logger.debug("Error while selecting feed title: "+str(e))
return False
try:
content = element.select(feed.content)[0].text
except Exception:
except Exception as e:
logger.debug("Error while selecting content: "+str(e))
return False
try:
date = element.select(feed.date)[0].text

View File

@ -4,22 +4,25 @@ from .utils import *
from .models import Feed
from django.db.models import ObjectDoesNotExist
from bs4 import BeautifulSoup
import logging
# Create your views here.
logger = logging.getLogger(__name__)
def iframe(request, url):
content_type = False
try:
req = requests.get(url)
req = get_url(url)
html = req.content
bs = False
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
if not content_type or content_type.startswith("text/html"):
logger.debug("No content-type or content-type ~= '^text/html'")
bs = BeautifulSoup(html, 'html.parser')
base_scheme = url.split("://")[0]
base_url = url.split("//")[-1].split("/")[0].split('?')[0]
logger.debug("URL: "+base_scheme+"://"+base_url)
# fixes
# fix click links
@ -33,6 +36,11 @@ def iframe(request, url):
if link["href"].startswith("/"):
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
# test: remove js
all_scripts = bs.find_all("script")
for script in all_scripts:
script.extract()
# fix absolute javascript
all_scripts = bs.find_all("script", {"src": True})
for script in all_scripts:
@ -48,7 +56,8 @@ def iframe(request, url):
html = final_html
except Exception as e:
html = str(e)
logger.debug(e)
return HttpResponse("An error has occured", content_type=500)
return HttpResponse(html, content_type=content_type)
def dummy(request):
@ -73,7 +82,7 @@ def setup(request, url):
def newfeed(request):
if request.method == 'POST':
if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST:
if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
return HttpResponse("Error, missing required element")
url = request.POST["url"]
element = request.POST["element"]
@ -114,6 +123,7 @@ def feed_delete(request, id):
# demo website: disable deleting feeds
if not request.get_host() == "hrss.hipstercat.fr:443":
Feed.objects.get(pk=id).delete()
logger.info("Removed feed ID "+id)
return redirect("/feeds")
else:
return HttpResponse("Deleting is disabled on demo website.", status=403)
@ -123,6 +133,9 @@ def feed_delete(request, id):
def rss(request, uurl):
try:
feed = Feed.objects.get(uurl=uurl)
return render(request, "feed.xml", {"feed": feed, "rss": fetch_feed(feed)}, content_type="application/rss+xml")
fetched = fetch_feed(feed)
if not fetched or not fetched["items"]:
return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
except ObjectDoesNotExist:
return HttpResponse("nope")
return HttpResponse("Error: feed is unknown", status=404)