Added logging, use html5lib instead of html.parser (better results overall), disallow adding feed with missing required fields
This commit is contained in:
parent
be90bbd67c
commit
91d7839703
@ -123,3 +123,37 @@ STATIC_URL = '/static/'
|
|||||||
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
|
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
|
||||||
|
|
||||||
USE_X_FORWARDED_HOST = True
|
USE_X_FORWARDED_HOST = True
|
||||||
|
|
||||||
|
LOGGING = {
|
||||||
|
'version': 1,
|
||||||
|
'disable_existing_loggers': False,
|
||||||
|
'formatters': {
|
||||||
|
'verbose': {
|
||||||
|
'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s'
|
||||||
|
},
|
||||||
|
'simple': {
|
||||||
|
'format': '%(levelname)s %(message)s'
|
||||||
|
},
|
||||||
|
},
|
||||||
|
'handlers': {
|
||||||
|
'file': {
|
||||||
|
'class': 'logging.FileHandler',
|
||||||
|
'filename': 'debug.log',
|
||||||
|
'formatter': 'verbose'
|
||||||
|
},
|
||||||
|
'console': {
|
||||||
|
'class': 'logging.StreamHandler',
|
||||||
|
'formatter': 'verbose'
|
||||||
|
},
|
||||||
|
},
|
||||||
|
'loggers': {
|
||||||
|
'web': {
|
||||||
|
'handlers': ['console', 'file'],
|
||||||
|
'level': 'DEBUG',
|
||||||
|
},
|
||||||
|
'django': {
|
||||||
|
'handlers': ['console', 'file'],
|
||||||
|
'level': 'ERROR',
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -1,3 +1,4 @@
|
|||||||
Django>=2.1.2
|
Django>=2.1.2
|
||||||
beautifulsoup4>=4.6.3
|
beautifulsoup4>=4.6.3
|
||||||
requests>=2.20.0
|
requests>=2.20.0
|
||||||
|
html5lib>=1.0.1
|
37
web/utils.py
37
web/utils.py
@ -2,20 +2,25 @@ import requests
|
|||||||
from django.core.validators import URLValidator
|
from django.core.validators import URLValidator
|
||||||
from django.core.exceptions import ValidationError
|
from django.core.exceptions import ValidationError
|
||||||
from requests.exceptions import RequestException
|
from requests.exceptions import RequestException
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def get_url(url):
|
||||||
|
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
|
||||||
|
return requests.get(url, headers=headers)
|
||||||
|
|
||||||
def is_valid_url(url):
|
def is_valid_url(url):
|
||||||
try:
|
try:
|
||||||
URLValidator()(url)
|
URLValidator()(url)
|
||||||
requests.get(url).raise_for_status()
|
get_url(url).raise_for_status()
|
||||||
return True
|
return True
|
||||||
except ValidationError as e:
|
except ValidationError as e:
|
||||||
print(url+" is not a valid url")
|
logger.debug(url+" is not a valid url")
|
||||||
return False
|
return False
|
||||||
except RequestException as e:
|
except RequestException as e:
|
||||||
print(url+" led to exception: "+str(e))
|
logger.debug(url+" led to exception: "+str(e))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
|
|||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
|
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
|
||||||
return s1[match.a: match.a + match.size] # -> apple pie
|
return s1[match.a: match.a + match.size]
|
||||||
|
|
||||||
|
|
||||||
def fetch_feed(feed, limit=10):
|
def fetch_feed(feed, limit=10):
|
||||||
|
logger.debug("Fetching feed "+feed.url)
|
||||||
items=[]
|
items=[]
|
||||||
rep = requests.get(feed.url).text
|
rep = get_url(feed.url).text
|
||||||
soup = BeautifulSoup(rep)
|
soup = BeautifulSoup(rep, "html5lib")
|
||||||
elements = soup.select(feed.element)
|
try:
|
||||||
|
elements = soup.select(feed.element)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
|
||||||
|
return False
|
||||||
|
logger.debug("Found "+str(len(elements))+" news")
|
||||||
for element in elements:
|
for element in elements:
|
||||||
if len(items) >= limit:
|
if len(items) >= limit:
|
||||||
break
|
break
|
||||||
|
logger.debug("Title ("+feed.element+ " > "+feed.title+")")
|
||||||
try:
|
try:
|
||||||
title = element.select(feed.title)[0].text
|
title = element.select(feed.title)[0].text
|
||||||
except Exception:
|
logger.debug("Match title "+title)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Error while selecting feed title: "+str(e))
|
||||||
return False
|
return False
|
||||||
try:
|
try:
|
||||||
content = element.select(feed.content)[0].text
|
content = element.select(feed.content)[0].text
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logger.debug("Error while selecting content: "+str(e))
|
||||||
return False
|
return False
|
||||||
try:
|
try:
|
||||||
date = element.select(feed.date)[0].text
|
date = element.select(feed.date)[0].text
|
||||||
|
25
web/views.py
25
web/views.py
@ -4,22 +4,25 @@ from .utils import *
|
|||||||
from .models import Feed
|
from .models import Feed
|
||||||
from django.db.models import ObjectDoesNotExist
|
from django.db.models import ObjectDoesNotExist
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import logging
|
||||||
# Create your views here.
|
# Create your views here.
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def iframe(request, url):
|
def iframe(request, url):
|
||||||
content_type = False
|
|
||||||
try:
|
try:
|
||||||
req = requests.get(url)
|
req = get_url(url)
|
||||||
html = req.content
|
html = req.content
|
||||||
bs = False
|
bs = False
|
||||||
|
|
||||||
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
|
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
|
||||||
|
|
||||||
if not content_type or content_type.startswith("text/html"):
|
if not content_type or content_type.startswith("text/html"):
|
||||||
|
logger.debug("No content-type or content-type ~= '^text/html'")
|
||||||
bs = BeautifulSoup(html, 'html.parser')
|
bs = BeautifulSoup(html, 'html.parser')
|
||||||
base_scheme = url.split("://")[0]
|
base_scheme = url.split("://")[0]
|
||||||
base_url = url.split("//")[-1].split("/")[0].split('?')[0]
|
base_url = url.split("//")[-1].split("/")[0].split('?')[0]
|
||||||
|
logger.debug("URL: "+base_scheme+"://"+base_url)
|
||||||
|
|
||||||
# fixes
|
# fixes
|
||||||
# fix click links
|
# fix click links
|
||||||
@ -33,6 +36,11 @@ def iframe(request, url):
|
|||||||
if link["href"].startswith("/"):
|
if link["href"].startswith("/"):
|
||||||
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
|
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
|
||||||
|
|
||||||
|
# test: remove js
|
||||||
|
all_scripts = bs.find_all("script")
|
||||||
|
for script in all_scripts:
|
||||||
|
script.extract()
|
||||||
|
|
||||||
# fix absolute javascript
|
# fix absolute javascript
|
||||||
all_scripts = bs.find_all("script", {"src": True})
|
all_scripts = bs.find_all("script", {"src": True})
|
||||||
for script in all_scripts:
|
for script in all_scripts:
|
||||||
@ -48,7 +56,8 @@ def iframe(request, url):
|
|||||||
html = final_html
|
html = final_html
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
html = str(e)
|
logger.debug(e)
|
||||||
|
return HttpResponse("An error has occured", content_type=500)
|
||||||
return HttpResponse(html, content_type=content_type)
|
return HttpResponse(html, content_type=content_type)
|
||||||
|
|
||||||
def dummy(request):
|
def dummy(request):
|
||||||
@ -73,7 +82,7 @@ def setup(request, url):
|
|||||||
|
|
||||||
def newfeed(request):
|
def newfeed(request):
|
||||||
if request.method == 'POST':
|
if request.method == 'POST':
|
||||||
if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST:
|
if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
|
||||||
return HttpResponse("Error, missing required element")
|
return HttpResponse("Error, missing required element")
|
||||||
url = request.POST["url"]
|
url = request.POST["url"]
|
||||||
element = request.POST["element"]
|
element = request.POST["element"]
|
||||||
@ -114,6 +123,7 @@ def feed_delete(request, id):
|
|||||||
# demo website: disable deleting feeds
|
# demo website: disable deleting feeds
|
||||||
if not request.get_host() == "hrss.hipstercat.fr:443":
|
if not request.get_host() == "hrss.hipstercat.fr:443":
|
||||||
Feed.objects.get(pk=id).delete()
|
Feed.objects.get(pk=id).delete()
|
||||||
|
logger.info("Removed feed ID "+id)
|
||||||
return redirect("/feeds")
|
return redirect("/feeds")
|
||||||
else:
|
else:
|
||||||
return HttpResponse("Deleting is disabled on demo website.", status=403)
|
return HttpResponse("Deleting is disabled on demo website.", status=403)
|
||||||
@ -123,6 +133,9 @@ def feed_delete(request, id):
|
|||||||
def rss(request, uurl):
|
def rss(request, uurl):
|
||||||
try:
|
try:
|
||||||
feed = Feed.objects.get(uurl=uurl)
|
feed = Feed.objects.get(uurl=uurl)
|
||||||
return render(request, "feed.xml", {"feed": feed, "rss": fetch_feed(feed)}, content_type="application/rss+xml")
|
fetched = fetch_feed(feed)
|
||||||
|
if not fetched or not fetched["items"]:
|
||||||
|
return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
|
||||||
|
return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
|
||||||
except ObjectDoesNotExist:
|
except ObjectDoesNotExist:
|
||||||
return HttpResponse("nope")
|
return HttpResponse("Error: feed is unknown", status=404)
|
Loading…
Reference in New Issue
Block a user