hrss/web/utils.py

79 lines
2.8 KiB
Python
Raw Normal View History

2018-10-24 23:57:41 +02:00
import requests
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
2018-10-24 23:57:41 +02:00
def get_url(url, timeout=10):
    """Issue an HTTP GET for ``url`` with a desktop-browser User-Agent header.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` waits for the server before raising
            ``requests.exceptions.Timeout``. Without a timeout,
            ``requests.get`` can block forever on an unresponsive host.

    Returns:
        The ``requests.Response`` object. The HTTP status is NOT checked
        here; callers decide whether to call ``raise_for_status()``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout elapses.
    """
    # NOTE(review): presumably a browser User-Agent is sent because some sites
    # reject the default python-requests one -- confirm before changing it.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
    return requests.get(url, headers=headers, timeout=timeout)
2018-10-24 23:57:41 +02:00
def is_valid_url(url):
    """Tell whether ``url`` is well-formed and reachable.

    The URL must pass Django's ``URLValidator`` and a GET request to it
    (through ``get_url``) must come back without an HTTP error status.

    Args:
        url: Candidate URL.

    Returns:
        True when both checks succeed; False when validation fails or the
        request raises a ``RequestException`` (which includes the
        ``HTTPError`` raised by ``raise_for_status()``).
    """
    try:
        URLValidator()(url)
        get_url(url).raise_for_status()
    except ValidationError:
        # Lazy %-formatting: concatenating ``url + "..."`` would raise
        # TypeError right here in the handler when ``url`` is not a str.
        logger.debug("%s is not a valid url", url)
        return False
    except RequestException as e:
        logger.debug("%s led to exception: %s", url, e)
        return False
    return True
def find_longest_common_string(s1, s2):
    """Return the longest contiguous substring shared by ``s1`` and ``s2``.

    Args:
        s1: First string; the result is sliced out of this one.
        s2: Second string.

    Returns:
        The longest common run of characters, or ``""`` when the strings
        share nothing. Which run wins on ties is decided by
        ``SequenceMatcher.find_longest_match``.
    """
    from difflib import SequenceMatcher

    # autojunk=False: the default heuristic ignores characters that are
    # "popular" in the second sequence once it is 200+ items long, which
    # makes find_longest_match miss (or shorten) real matches in long text.
    matcher = SequenceMatcher(None, s1, s2, autojunk=False)
    match = matcher.find_longest_match(0, len(s1), 0, len(s2))
    return s1[match.a:match.a + match.size]
2018-10-24 23:57:41 +02:00
def _first_text(element, selector):
    """Return the text of the first node under ``element`` matching CSS ``selector``."""
    return element.select(selector)[0].text


def _optional_text(element, selector):
    """Like ``_first_text`` but return False when the lookup fails for any reason."""
    try:
        return _first_text(element, selector)
    except Exception:
        # Best-effort field: a missing node or an unusable selector just
        # means "no value" for this item.
        return False


def _resolve_link(page_url, href):
    """Make a scheme-less ``href`` absolute by resolving it against ``page_url``."""
    from urllib.parse import urljoin, urlsplit

    if not href:
        return href
    try:
        if urlsplit(href).scheme:
            # Already absolute (http:, https:, mailto:, ...): keep untouched.
            return href
        return urljoin(page_url, href)
    except ValueError:
        # Unparseable URL: hand the raw value back rather than lose it.
        return href


def fetch_feed(feed, limit=10):
    """Scrape the page at ``feed.url`` and extract up to ``limit`` news items.

    Args:
        feed: Object exposing ``url`` plus the CSS selectors ``element``,
            ``title``, ``content``, ``date``, ``author`` and ``link``.
            ``element`` selects each news entry in the page; the other
            selectors are applied inside each entry.
        limit: Maximum number of items to collect.

    Returns:
        ``{"title": <page title>, "items": [...]}`` where each item is a
        dict with the keys ``title``, ``content``, ``pubDate``, ``author``
        and ``link``. ``pubDate``, ``author`` and ``link`` are False when
        they cannot be extracted. Returns False when the ``element``
        selector fails, or when the title or content of any processed entry
        cannot be extracted.

    Raises:
        requests.exceptions.RequestException: if the page download fails;
            it is not caught here.
    """
    logger.debug("Fetching feed %s", feed.url)

    page = get_url(feed.url).text
    soup = BeautifulSoup(page, "html5lib")
    try:
        elements = soup.select(feed.element)
    except Exception as e:
        logger.debug("Error while fetching elements (%s): %s", feed.element, e)
        return False
    logger.debug("Found %d news", len(elements))

    items = []
    for element in elements:
        if len(items) >= limit:
            break
        logger.debug("Title (%s > %s)", feed.element, feed.title)

        # Title and content are mandatory: failing on either aborts the feed.
        try:
            title = _first_text(element, feed.title)
            logger.debug("Match title %s", title)
        except Exception as e:
            logger.debug("Error while selecting feed title: %s", e)
            return False
        try:
            content = _first_text(element, feed.content)
        except Exception as e:
            logger.debug("Error while selecting content: %s", e)
            return False

        date = _optional_text(element, feed.date)
        author = _optional_text(element, feed.author)
        try:
            # Relative hrefs (issue #5) are resolved against the feed URL with
            # urllib.parse instead of manual string splitting, which picked
            # the wrong host when the feed URL itself contained a later "//".
            link = _resolve_link(feed.url, element.select(feed.link)[0]["href"])
        except Exception:
            link = False

        items.append({"title": title, "content": content, "pubDate": date, "author": author, "link": link})

    # A page without a <title> tag makes soup.find return None; fall back to
    # the feed URL instead of raising AttributeError.
    title_tag = soup.find("title")
    page_title = title_tag.text if title_tag is not None else feed.url
    return {"title": page_title, "items": items}