import logging

import requests
from bs4 import BeautifulSoup
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from requests.exceptions import RequestException

logger = logging.getLogger(__name__)


def get_url(url):
    """Fetch `url` and return the `requests.Response`."""
    # Spoof a desktop Firefox User-Agent; some sites reject the default
    # python-requests client string.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
    return requests.get(url, headers=headers)


def is_valid_url(url):
    """Return True if `url` is syntactically valid and responds without an HTTP error."""
    try:
        URLValidator()(url)
        get_url(url).raise_for_status()
        return True
    except ValidationError:
        logger.debug(url + " is not a valid url")
        return False
    except RequestException as e:
        logger.debug(url + " led to exception: " + str(e))
        return False
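
# Example (illustrative): is_valid_url("not a url") fails Django's URLValidator and
# returns False without touching the network; a well-formed URL is additionally
# fetched once, so an unreachable or 4xx/5xx address also yields False.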


def find_longest_common_string(s1, s2):
    """Return the longest substring shared by `s1` and `s2`."""
    from difflib import SequenceMatcher

    match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
    return s1[match.a: match.a + match.size]
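
# Example (illustrative): find_longest_common_string("python-3.7", "cpython-3.8")
# returns "python-3.", the longest substring present in both inputs.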


def fetch_feed(feed, limit=10):
    """Scrape `feed.url` and return its page title plus up to `limit` items.

    Items are extracted with the feed's CSS selectors (`element`, `title`,
    `content`, `date`, `author`, `link`); returns False when a mandatory
    selector (element, title or content) cannot be applied.
    """
    logger.debug("Fetching feed " + feed.url)
    items = []
    rep = get_url(feed.url).text
    soup = BeautifulSoup(rep, "html5lib")
    try:
        elements = soup.select(feed.element)
    except Exception as e:
        logger.debug("Error while fetching elements (" + feed.element + "): " + str(e))
        return False
    logger.debug("Found " + str(len(elements)) + " news items")
    for element in elements:
        if len(items) >= limit:
            break
        logger.debug("Title (" + feed.element + " > " + feed.title + ")")
        try:
            title = element.select(feed.title)[0].text
            logger.debug("Match title " + title)
        except Exception as e:
            logger.debug("Error while selecting feed title: " + str(e))
            return False
        try:
            content = element.select(feed.content)[0].text
        except Exception as e:
            logger.debug("Error while selecting content: " + str(e))
            return False
        # Date and author are optional: fall back to False when the selector
        # does not match anything.
        try:
            date = element.select(feed.date)[0].text
        except Exception:
            date = False
        try:
            author = element.select(feed.author)[0].text
        except Exception:
            author = False
        try:
            link = element.select(feed.link)[0]["href"]
            if link and len(link) > 2 and link[0] == "/" and link[1] != "/":
                # Fixes issue #5: the href is relative, so prepend the scheme
                # and host taken from the feed's own URL.
                base_scheme = feed.url.split("://")[0]
                base_url = feed.url.split("//")[-1].split("/")[0].split('?')[0]
                link = base_scheme + "://" + base_url + link
        except Exception:
            link = False
        items.append({"title": title, "content": content, "pubDate": date, "author": author, "link": link})
    return {"title": soup.find("title").text, "items": items}
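

# Minimal usage sketch (illustrative only; the real Feed model lives elsewhere in
# the project and merely needs to expose these attributes as CSS selectors):
#
#     class DemoFeed:
#         url = "https://example.com/news"
#         element = "article"
#         title = "h2"
#         content = "p"
#         date = "time"
#         author = ".author"
#         link = "a"
#
#     result = fetch_feed(DemoFeed(), limit=5)
#     if result:
#         for item in result["items"]:
#             print(item["title"], item["link"])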