import logging
from difflib import SequenceMatcher

import requests
from bs4 import BeautifulSoup
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from requests.exceptions import RequestException

logger = logging.getLogger(__name__)


def get_url(url):
    """Fetch a URL with a browser-like User-Agent to avoid naive bot blocking."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) "
                      "Gecko/20100101 Firefox/63.0"
    }
    return requests.get(url, headers=headers)


def is_valid_url(url):
    """Return True if `url` is well-formed and responds with a non-error status."""
    try:
        URLValidator()(url)
        get_url(url).raise_for_status()
        return True
    except ValidationError:
        logger.debug("%s is not a valid url", url)
        return False
    except RequestException as e:
        logger.debug("%s led to exception: %s", url, e)
        return False


def find_longest_common_string(s1, s2):
    """Return the longest substring common to `s1` and `s2`."""
    match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
    return s1[match.a:match.a + match.size]


def fetch_feed(feed, limit=10):
    """Scrape up to `limit` items from `feed`, an object whose `url` attribute
    points at an HTML page and whose `element`, `title`, `content`, `date`,
    `author`, and `link` attributes are CSS selectors. Returns a dict with the
    page title and the scraped items, or False on failure."""
    logger.debug("Fetching feed %s", feed.url)
    items = []
    rep = get_url(feed.url).text
    soup = BeautifulSoup(rep, "html5lib")

    try:
        elements = soup.select(feed.element)
    except Exception as e:
        logger.debug("Error while fetching elements (%s): %s", feed.element, e)
        return False
    logger.debug("Found %d news items", len(elements))

    for element in elements:
        if len(items) >= limit:
            break
        logger.debug("Title (%s > %s)", feed.element, feed.title)
        # Title and content are required; abort the whole fetch if either is missing.
        try:
            title = element.select(feed.title)[0].text
            logger.debug("Matched title %s", title)
        except Exception as e:
            logger.debug("Error while selecting feed title: %s", e)
            return False
        try:
            content = element.select(feed.content)[0].text
        except Exception as e:
            logger.debug("Error while selecting content: %s", e)
            return False
        # Date, author, and link are optional; fall back to False when absent.
        try:
            date = element.select(feed.date)[0].text
        except Exception:
            date = False
        try:
            author = element.select(feed.author)[0].text
        except Exception:
            author = False
        try:
            link = element.select(feed.link)[0]["href"]
            if link and len(link) > 2 and link[0] == "/" and link[1] != "/":
                # Fixes issue #5 with relative links: prepend the feed's scheme
                # and host to root-relative hrefs ("/path", not "//host/path").
                base_scheme = feed.url.split("://")[0]
                base_url = feed.url.split("//")[-1].split("/")[0].split("?")[0]
                link = base_scheme + "://" + base_url + link
        except Exception:
            link = False
        items.append({
            "title": title,
            "content": content,
            "pubDate": date,
            "author": author,
            "link": link,
        })

    return {"title": soup.find("title").text, "items": items}
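

# --- Usage sketch (illustrative only, not part of the module) ---
# `fetch_feed` expects a feed-like object carrying a URL plus CSS selectors.
# The `SimpleFeed` namedtuple below is a hypothetical stand-in for whatever
# model the surrounding project actually uses, and the URL and selectors are
# made-up examples, not values from the source.
if __name__ == "__main__":
    from collections import namedtuple

    SimpleFeed = namedtuple(
        "SimpleFeed",
        ["url", "element", "title", "content", "date", "author", "link"],
    )
    feed = SimpleFeed(
        url="https://example.com/news",   # assumed URL for illustration
        element="article",                # selector for one news item
        title="h2",                       # selector for the item title
        content="p.summary",              # selector for the item body
        date="time",                      # optional: publication date
        author=".byline",                 # optional: author name
        link="a",                         # element whose href is the item link
    )
    result = fetch_feed(feed, limit=5)
    if result:
        print(result["title"], "-", len(result["items"]), "items")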