Add logging, use html5lib instead of html.parser (better results overall), and disallow adding a feed with missing required fields
This commit is contained in:
parent
be90bbd67c
commit
91d7839703
@ -123,3 +123,37 @@ STATIC_URL = '/static/'
|
||||
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
|
||||
|
||||
USE_X_FORWARDED_HOST = True
|
||||
|
||||
# Logging configuration in the standard ``logging.config`` dictionary schema.
LOGGING = {
    'version': 1,
    # Loggers configured before this dict is applied stay enabled.
    'disable_existing_loggers': False,
    'formatters': {
        'verbose': {
            'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s'
        },
        # Defined for convenience; no handler below references it yet.
        'simple': {
            'format': '%(levelname)s %(message)s'
        },
    },
    'handlers': {
        'file': {
            'class': 'logging.FileHandler',
            # Anchor the log file to the project directory so its location
            # does not depend on the working directory of the process.
            'filename': os.path.join(BASE_DIR, 'debug.log'),
            'formatter': 'verbose'
        },
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'verbose'
        },
    },
    'loggers': {
        # Application logger: everything from DEBUG upwards.
        'web': {
            'handlers': ['console', 'file'],
            'level': 'DEBUG',
        },
        # Framework logger: errors only.
        'django': {
            'handlers': ['console', 'file'],
            'level': 'ERROR',
        }
    }
}
|
@ -1,3 +1,4 @@
|
||||
Django>=2.1.2
|
||||
beautifulsoup4>=4.6.3
|
||||
requests>=2.20.0
|
||||
html5lib>=1.0.1
|
37
web/utils.py
37
web/utils.py
@ -2,20 +2,25 @@ import requests
|
||||
from django.core.validators import URLValidator
|
||||
from django.core.exceptions import ValidationError
|
||||
from requests.exceptions import RequestException
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_url(url, timeout=10):
    """Issue an HTTP GET for *url* with a desktop Firefox User-Agent header.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; handed
            to ``requests.get`` unchanged. Without it the call can block
            indefinitely on an unresponsive host.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
    return requests.get(url, headers=headers, timeout=timeout)
|
||||
|
||||
def is_valid_url(url):
    """Tell whether *url* is syntactically valid and reachable.

    The value is first checked with Django's ``URLValidator``; if that
    passes, it is fetched through ``get_url`` and the response status is
    checked with ``raise_for_status()``.

    Args:
        url: Candidate URL.

    Returns:
        True when both checks succeed, False otherwise. Failures are
        logged at DEBUG level instead of being raised.
    """
    try:
        URLValidator()(url)
        get_url(url).raise_for_status()
    except ValidationError:
        # %-style arguments keep the handler from failing on a non-str url.
        logger.debug("%s is not a valid url", url)
        return False
    except RequestException as e:
        logger.debug("%s led to exception: %s", url, e)
        return False
    return True
|
||||
|
||||
|
||||
@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
|
||||
return s1[match.a: match.a + match.size] # -> apple pie
|
||||
return s1[match.a: match.a + match.size]
|
||||
|
||||
|
||||
def fetch_feed(feed, limit=10):
|
||||
logger.debug("Fetching feed "+feed.url)
|
||||
items=[]
|
||||
rep = requests.get(feed.url).text
|
||||
soup = BeautifulSoup(rep)
|
||||
elements = soup.select(feed.element)
|
||||
rep = get_url(feed.url).text
|
||||
soup = BeautifulSoup(rep, "html5lib")
|
||||
try:
|
||||
elements = soup.select(feed.element)
|
||||
except Exception as e:
|
||||
logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
|
||||
return False
|
||||
logger.debug("Found "+str(len(elements))+" news")
|
||||
for element in elements:
|
||||
if len(items) >= limit:
|
||||
break
|
||||
|
||||
logger.debug("Title ("+feed.element+ " > "+feed.title+")")
|
||||
try:
|
||||
title = element.select(feed.title)[0].text
|
||||
except Exception:
|
||||
logger.debug("Match title "+title)
|
||||
except Exception as e:
|
||||
logger.debug("Error while selecting feed title: "+str(e))
|
||||
return False
|
||||
try:
|
||||
content = element.select(feed.content)[0].text
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
logger.debug("Error while selecting content: "+str(e))
|
||||
return False
|
||||
try:
|
||||
date = element.select(feed.date)[0].text
|
||||
|
25
web/views.py
25
web/views.py
@ -4,22 +4,25 @@ from .utils import *
|
||||
from .models import Feed
|
||||
from django.db.models import ObjectDoesNotExist
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
# Create your views here.
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def iframe(request, url):
|
||||
content_type = False
|
||||
try:
|
||||
req = requests.get(url)
|
||||
req = get_url(url)
|
||||
html = req.content
|
||||
bs = False
|
||||
|
||||
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
|
||||
|
||||
if not content_type or content_type.startswith("text/html"):
|
||||
logger.debug("No content-type or content-type ~= '^text/html'")
|
||||
bs = BeautifulSoup(html, 'html.parser')
|
||||
base_scheme = url.split("://")[0]
|
||||
base_url = url.split("//")[-1].split("/")[0].split('?')[0]
|
||||
logger.debug("URL: "+base_scheme+"://"+base_url)
|
||||
|
||||
# fixes
|
||||
# fix click links
|
||||
@ -33,6 +36,11 @@ def iframe(request, url):
|
||||
if link["href"].startswith("/"):
|
||||
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
|
||||
|
||||
# test: remove js
|
||||
all_scripts = bs.find_all("script")
|
||||
for script in all_scripts:
|
||||
script.extract()
|
||||
|
||||
# fix absolute javascript
|
||||
all_scripts = bs.find_all("script", {"src": True})
|
||||
for script in all_scripts:
|
||||
@ -48,7 +56,8 @@ def iframe(request, url):
|
||||
html = final_html
|
||||
|
||||
except Exception as e:
|
||||
html = str(e)
|
||||
logger.debug(e)
|
||||
return HttpResponse("An error has occured", content_type=500)
|
||||
return HttpResponse(html, content_type=content_type)
|
||||
|
||||
def dummy(request):
|
||||
@ -73,7 +82,7 @@ def setup(request, url):
|
||||
|
||||
def newfeed(request):
|
||||
if request.method == 'POST':
|
||||
if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST:
|
||||
if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
|
||||
return HttpResponse("Error, missing required element")
|
||||
url = request.POST["url"]
|
||||
element = request.POST["element"]
|
||||
@ -114,6 +123,7 @@ def feed_delete(request, id):
|
||||
# demo website: disable deleting feeds
|
||||
if not request.get_host() == "hrss.hipstercat.fr:443":
|
||||
Feed.objects.get(pk=id).delete()
|
||||
logger.info("Removed feed ID "+id)
|
||||
return redirect("/feeds")
|
||||
else:
|
||||
return HttpResponse("Deleting is disabled on demo website.", status=403)
|
||||
@ -123,6 +133,9 @@ def feed_delete(request, id):
|
||||
def rss(request, uurl):
    """Serve the feed stored under *uurl* as an RSS document.

    Args:
        request: Incoming HTTP request.
        uurl: Value matched against ``Feed.uurl`` to find the feed.

    Returns:
        The rendered ``feed.xml`` template with content type
        ``application/rss+xml``; a 404 response when no feed matches
        *uurl*; a 422 response when fetching produced nothing.
    """
    # Only the lookup is guarded, so an unrelated ObjectDoesNotExist raised
    # while fetching or rendering is not misreported as an unknown feed.
    try:
        feed = Feed.objects.get(uurl=uurl)
    except ObjectDoesNotExist:
        return HttpResponse("Error: feed is unknown", status=404)

    fetched = fetch_feed(feed)
    # fetch_feed returns False on selector errors; an empty item list is
    # treated the same way.
    if not fetched or not fetched["items"]:
        return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
    return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
|
Loading…
Reference in New Issue
Block a user