Compare commits

...

16 Commits
v0.1 ... master

14 changed files with 150 additions and 54 deletions

View File

@ -28,7 +28,7 @@ HRSS is an application that allows you to transform any website into a RSS feed.
[Because we all love demos.](https://hrss.hipstercat.fr) [Because we all love demos.](https://hrss.hipstercat.fr)
# Installation # Installation (native)
1. Make sure you have Python3 installed and virtualenv. 1. Make sure you have Python3 installed and virtualenv.
2. Clone this repo for unstable branch, or download [latest release](https://hipstercat.fr/gogs/hipstercat/hrss/releases). 2. Clone this repo for unstable branch, or download [latest release](https://hipstercat.fr/gogs/hipstercat/hrss/releases).
@ -36,4 +36,7 @@ HRSS is an application that allows you to transform any website into a RSS feed.
4. `bin/activate && pip3 install -r requirements.txt` to install dependencies. 4. `bin/activate && pip3 install -r requirements.txt` to install dependencies.
5. `python3 manage.py migrate` to apply latest DB migrations (and create the H2 database file). 5. `python3 manage.py migrate` to apply latest DB migrations (and create the H2 database file).
6. `python3 manage.py runserver` to run the integrated webserver (not suitable for production use - even though you should not use HRSS in a production environment at all until it is considered stable anyway). Use uWSGI to run it in a production environment. 6. `python3 manage.py runserver` to run the integrated webserver (not suitable for production use - even though you should not use HRSS in a production environment at all until it is considered stable anyway). Use uWSGI to run it in a production environment.
# Installation (Docker)
Use https://github.com/kmlucy/docker-hrss, kindly provided by [kmlucy](https://github.com/kmlucy).

View File

@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/2.0/ref/settings/
""" """
import os import os
import urllib.parse
# Build paths inside the project like this: os.path.join(BASE_DIR, ...) # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@ -27,6 +28,9 @@ DEBUG = True
ALLOWED_HOSTS = [] ALLOWED_HOSTS = []
# Set up your website base URL here, WITHOUT A TRAILING SLASH.
BASE_URL = "http://localhost:8000"
# Application definition # Application definition
@ -119,7 +123,43 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images) # Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.0/howto/static-files/ # https://docs.djangoproject.com/en/2.0/howto/static-files/
STATIC_URL = '/static/' STATIC_URL = '/'+urllib.parse.urlparse(BASE_URL).path[1:]+'/static/' if urllib.parse.urlparse(BASE_URL).path[1:] else '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static') STATIC_ROOT = os.path.join(BASE_DIR, 'static')
USE_X_FORWARDED_HOST = True USE_X_FORWARDED_HOST = True
# Logging configuration in the logging.config.dictConfig (version 1) layout.
#
# Two formatters are declared; only 'verbose' is referenced by the handlers
# below, 'simple' is available but currently unused.
_log_formatters = {
    'verbose': {
        'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s',
    },
    'simple': {
        'format': '%(levelname)s %(message)s',
    },
}

# 'debug.log' is a relative path, so the file is created relative to the
# working directory of the running process.
_log_handlers = {
    'file': {
        'class': 'logging.FileHandler',
        'filename': 'debug.log',
        'formatter': 'verbose',
    },
    'console': {
        'class': 'logging.StreamHandler',
        'formatter': 'verbose',
    },
}

# The application logger ('web') is verbose; Django's own logger only
# reports errors. Both write to the console and to the log file.
_log_loggers = {
    logger_name: {'handlers': ['console', 'file'], 'level': threshold}
    for logger_name, threshold in (('web', 'DEBUG'), ('django', 'ERROR'))
}

LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': _log_formatters,
    'handlers': _log_handlers,
    'loggers': _log_loggers,
}
X_FRAME_OPTIONS = 'SAMEORIGIN'

View File

@ -16,8 +16,12 @@ Including another URLconf
from django.contrib import admin from django.contrib import admin
from django.urls import path from django.urls import path
from django.conf.urls import include, url from django.conf.urls import include, url
from django.conf import settings
import urllib.parse
base_path = urllib.parse.urlparse(settings.BASE_URL).path[1:]
urlpatterns = [ urlpatterns = [
path('admin/', admin.site.urls), path(base_path+'/admin/' if base_path else 'admin/', admin.site.urls),
url(r'', include('web.urls')), url(base_path+'/' if base_path else '', include('web.urls')),
] ]

4
requirements.txt Normal file
View File

@ -0,0 +1,4 @@
Django>=2.1.2
beautifulsoup4>=4.6.3
requests>=2.20.0
html5lib>=1.0.1

View File

@ -1,3 +1,4 @@
from django.contrib import admin from django.contrib import admin
from .models import Feed
# Register your models here. admin.site.register(Feed)

View File

@ -1,11 +1,9 @@
from django.conf import settings
import urllib.parse
def BASE_URL(request): def BASE_URL(request):
""" """
Return a BASE_URL template context for the current request. Return a BASE_URL template context for the current request.
""" """
if request.is_secure(): parse = urllib.parse.urlparse(settings.BASE_URL)
scheme = 'https://' return {'BASE_URL': parse.scheme+"://"+parse.netloc, }
else:
scheme = 'http://'
fullhost = request.get_host()
base = fullhost.split(":")[0]
return {'BASE_URL': scheme + base, }

View File

@ -10,7 +10,7 @@ def random_url():
class Feed(models.Model): class Feed(models.Model):
url = models.URLField(max_length=255) url = models.URLField(max_length=255)
element = models.CharField(max_length=255) element = models.CharField(max_length=1000)
title = models.CharField(max_length=255) title = models.CharField(max_length=255)
content = models.CharField(max_length=255) content = models.CharField(max_length=255)
date = models.CharField(max_length=255) date = models.CharField(max_length=255)
@ -18,4 +18,4 @@ class Feed(models.Model):
link = models.CharField(max_length=255) link = models.CharField(max_length=255)
creation_date = models.DateTimeField(auto_now=True) creation_date = models.DateTimeField(auto_now=True)
uurl = models.CharField(max_length=18, default=random_url, unique=True) uurl = models.CharField(max_length=18, default=random_url, unique=True)

View File

@ -19,7 +19,7 @@
<div id="collapse{{ feed.id }}" class="collapse" aria-labelledby="heading{{ feed.id }}" data-parent="#accordionExample"> <div id="collapse{{ feed.id }}" class="collapse" aria-labelledby="heading{{ feed.id }}" data-parent="#accordionExample">
<div class="card-body"> <div class="card-body">
<code>{{ BASE_URL }}/{{ feed.uurl }}.rss</code> <code>{{ BASE_URL }}{% url 'rss' uurl=feed.uurl %}</code>
<p><a class="btn btn-danger" href="{% url 'feed_delete' feed.id %}">Delete</a></p> <p><a class="btn btn-danger" href="{% url 'feed_delete' feed.id %}">Delete</a></p>
</div> </div>
</div> </div>

View File

@ -5,7 +5,7 @@
<div class="col-lg-12 text-center"> <div class="col-lg-12 text-center">
<h1 class="mt-5">Generate RSS out of any website</h1> <h1 class="mt-5">Generate RSS out of any website</h1>
{% if error %}<div class="alert alert-danger" role="alert">{{ error }}</div>{% endif %} {% if error %}<div class="alert alert-danger" role="alert">{{ error }}</div>{% endif %}
<form action="/" method="post"> <form action="{% url 'homepage' %}" method="post">
{% csrf_token %} {% csrf_token %}
<div class="form-group"> <div class="form-group">
<input type="text" class="form-control" name="url" id="url" placeholder="http://" {% if url %}value="{{ url }}"{% endif %}> <input type="text" class="form-control" name="url" id="url" placeholder="http://" {% if url %}value="{{ url }}"{% endif %}>

View File

@ -29,7 +29,7 @@
<code style="display:block" id="link-selector"></code> <code style="display:block" id="link-selector"></code>
</li> </li>
<li> <li>
<form action="/newfeed" method="post"> <form action="{% url 'newfeed' %}" method="post">
{% csrf_token %} {% csrf_token %}
<input type="hidden" id="url" name="url" value="{{ url }}"> <input type="hidden" id="url" name="url" value="{{ url }}">
<input type="hidden" id="element" name="element" value=""> <input type="hidden" id="element" name="element" value="">
@ -44,7 +44,7 @@
</ul> </ul>
</div> </div>
<!-- /#sidebar-wrapper --> <!-- /#sidebar-wrapper -->
<iframe id="preview" style="width:100%" src="/iframe/{{ url }}"></iframe> <iframe id="preview" style="width:100%" src="{% url 'iframe' encodedurl=encodedurl %}"></iframe>
<script> <script>
$(function() { $(function() {
function handleResize() { function handleResize() {

View File

@ -26,14 +26,14 @@
<!-- Navigation --> <!-- Navigation -->
<nav class="navbar navbar-expand-lg navbar-dark bg-dark static-top"> <nav class="navbar navbar-expand-lg navbar-dark bg-dark static-top">
<div class="container"> <div class="container">
<a class="navbar-brand" href="/">HRSS</a> <a class="navbar-brand" href="{% url 'homepage' %}">HRSS</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation"> <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span> <span class="navbar-toggler-icon"></span>
</button> </button>
<div class="collapse navbar-collapse" id="navbarResponsive"> <div class="collapse navbar-collapse" id="navbarResponsive">
<ul class="navbar-nav ml-auto"> <ul class="navbar-nav ml-auto">
<li class="nav-item"> <li class="nav-item">
<a class="nav-link" href="/">New <a class="nav-link" href="{% url 'homepage' %}">New
<span class="sr-only">(current)</span> <span class="sr-only">(current)</span>
</a> </a>
</li> </li>

View File

@ -3,8 +3,8 @@ from . import views
urlpatterns = [ urlpatterns = [
url(r'^$', views.homepage, name='homepage'), url(r'^$', views.homepage, name='homepage'),
url(r'^iframe/(?P<url>.+)$', views.iframe, name='iframe'), url(r'^iframe/(?P<encodedurl>.+)$', views.iframe, name='iframe'),
url(r'^setup/(?P<url>.+)$', views.setup, name='setup'), url(r'^setup/(?P<encodedurl>.+)$', views.setup, name='setup'),
url(r'^newfeed$', views.newfeed, name='newfeed'), url(r'^newfeed$', views.newfeed, name='newfeed'),
url(r'^feeds$', views.feeds, name='feeds'), url(r'^feeds$', views.feeds, name='feeds'),
url(r'^feeds/delete/(?P<id>[0-9]+)$', views.feed_delete, name='feed_delete'), url(r'^feeds/delete/(?P<id>[0-9]+)$', views.feed_delete, name='feed_delete'),

View File

@ -2,20 +2,25 @@ import requests
from django.core.validators import URLValidator from django.core.validators import URLValidator
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from requests.exceptions import RequestException from requests.exceptions import RequestException
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
def get_url(url, timeout=30):
    """Fetch *url* with an HTTP GET request using a browser-like User-Agent.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the remote server before giving up.
            Passed straight to ``requests.get``; without it an unresponsive
            site would block the calling request indefinitely.

    Returns:
        The response object returned by ``requests.get``. The HTTP status is
        not checked here; callers decide whether to call
        ``raise_for_status()``.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires (``requests.exceptions.Timeout`` is a
            subclass).
    """
    # NOTE(review): the desktop Firefox User-Agent is presumably sent because
    # some sites reject or alter responses for the default client UA - confirm.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
    return requests.get(url, headers=headers, timeout=timeout)
def is_valid_url(url): def is_valid_url(url):
try: try:
URLValidator()(url) URLValidator()(url)
requests.get(url).raise_for_status() get_url(url).raise_for_status()
return True return True
except ValidationError as e: except ValidationError as e:
print(url+" is not a valid url") logger.debug(url+" is not a valid url")
return False return False
except RequestException as e: except RequestException as e:
print(url+" led to exception: "+str(e)) logger.debug(url+" led to exception: "+str(e))
return False return False
@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
from difflib import SequenceMatcher from difflib import SequenceMatcher
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2)) match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
return s1[match.a: match.a + match.size] # -> apple pie return s1[match.a: match.a + match.size]
def fetch_feed(feed, limit=10): def fetch_feed(feed, limit=10):
logger.debug("Fetching feed "+feed.url)
items=[] items=[]
rep = requests.get(feed.url).text rep = get_url(feed.url).text
soup = BeautifulSoup(rep) soup = BeautifulSoup(rep, "html5lib")
elements = soup.select(feed.element) try:
elements = soup.select(feed.element)
except Exception as e:
logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
return False
logger.debug("Found "+str(len(elements))+" news")
for element in elements: for element in elements:
if len(items) >= limit: if len(items) >= limit:
break break
logger.debug("Title ("+feed.element+ " > "+feed.title+")")
try: try:
title = element.select(feed.title)[0].text title = element.select(feed.title)[0].text
except Exception: logger.debug("Match title "+title)
except Exception as e:
logger.debug("Error while selecting feed title: "+str(e))
return False return False
try: try:
content = element.select(feed.content)[0].text content = element.select(feed.content)[0].text
except Exception: except Exception as e:
logger.debug("Error while selecting content: "+str(e))
return False return False
try: try:
date = element.select(feed.date)[0].text date = element.select(feed.date)[0].text
@ -52,6 +67,12 @@ def fetch_feed(feed, limit=10):
author = False author = False
try: try:
link = element.select(feed.link)[0]["href"] link = element.select(feed.link)[0]["href"]
if link and len(link) > 2 and link[0] == "/" and link[1] != "/":
# fixes issue #5 with relative link
# prepend base url
base_scheme = feed.url.split("://")[0]
base_url = feed.url.split("//")[-1].split("/")[0].split('?')[0]
link = base_scheme + "://" + base_url + link
except Exception: except Exception:
link = False link = False
items.append({"title": title, "content": content, "pubDate": date, "author": author, "link": link}) items.append({"title": title, "content": content, "pubDate": date, "author": author, "link": link})

View File

@ -4,22 +4,30 @@ from .utils import *
from .models import Feed from .models import Feed
from django.db.models import ObjectDoesNotExist from django.db.models import ObjectDoesNotExist
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import logging
from urllib.parse import quote_plus, unquote_plus
import traceback
import sys
# Create your views here. # Create your views here.
logger = logging.getLogger(__name__)
def iframe(request, url): def iframe(request, encodedurl):
content_type = False sys.setrecursionlimit(10000)
try: try:
req = requests.get(url) url = unquote_plus(encodedurl)
req = get_url(url)
html = req.content html = req.content
bs = False bs = False
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
if not content_type or content_type.startswith("text/html"): if not content_type or content_type.startswith("text/html"):
logger.debug("No content-type or content-type ~= '^text/html'")
bs = BeautifulSoup(html, 'html.parser') bs = BeautifulSoup(html, 'html.parser')
base_scheme = url.split("://")[0] base_scheme = url.split("://")[0]
base_url = url.split("//")[-1].split("/")[0].split('?')[0] base_url = url.split("//")[-1].split("/")[0].split('?')[0]
logger.debug("URL: "+base_scheme+"://"+base_url)
# fixes # fixes
# fix click links # fix click links
@ -33,6 +41,11 @@ def iframe(request, url):
if link["href"].startswith("/"): if link["href"].startswith("/"):
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"] link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
# test: remove js
all_scripts = bs.find_all("script")
for script in all_scripts:
script.extract()
# fix absolute javascript # fix absolute javascript
all_scripts = bs.find_all("script", {"src": True}) all_scripts = bs.find_all("script", {"src": True})
for script in all_scripts: for script in all_scripts:
@ -48,32 +61,37 @@ def iframe(request, url):
html = final_html html = final_html
except Exception as e: except Exception as e:
html = str(e) traceback.print_exc()
return HttpResponse("An error has occured", content_type=500)
return HttpResponse(html, content_type=content_type) return HttpResponse(html, content_type=content_type)
def dummy(request): def dummy(request):
return HttpResponse("toto") return HttpResponse("toto")
def homepage(request): def homepage(request):
if request.method == 'POST': if request.method == 'POST':
if "url" in request.POST and request.POST["url"]: if "url" in request.POST and request.POST["url"]:
url = request.POST["url"] url = request.POST["url"]
if is_valid_url(url): if is_valid_url(url):
return redirect("/setup/"+url) return redirect("setup", encodedurl=quote_plus(url))
else: else:
return render(request, 'homepage.html', {"url": url, "error": url+" is not a valid URL."}) return render(request, 'homepage.html', {"url": url, "error": url+" is not a valid URL."})
return render(request, 'homepage.html') return render(request, 'homepage.html')
def setup(request, url):
if is_valid_url(url): def setup(request, encodedurl):
return render(request, 'setup.html', {"url": url}) decoded_url = unquote_plus(encodedurl)
if is_valid_url(decoded_url):
return render(request, 'setup.html', {"encodedurl": encodedurl, "url": decoded_url})
else: else:
return redirect("/") return redirect("homepage")
def newfeed(request): def newfeed(request):
if request.method == 'POST': if request.method == 'POST':
if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST: if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
return HttpResponse("Error, missing required element") return HttpResponse("Error, missing required element")
url = request.POST["url"] url = request.POST["url"]
element = request.POST["element"] element = request.POST["element"]
@ -101,9 +119,9 @@ def newfeed(request):
feed = Feed(url=url, element=element, title=title, content=content, date=date, author=author, link=link) feed = Feed(url=url, element=element, title=title, content=content, date=date, author=author, link=link)
feed.save() feed.save()
return redirect("/feeds") return redirect("feeds")
else: else:
return redirect("/") return redirect("homepage")
def feeds(request): def feeds(request):
feeds = Feed.objects.all() feeds = Feed.objects.all()
@ -111,15 +129,22 @@ def feeds(request):
def feed_delete(request, id): def feed_delete(request, id):
try: try:
Feed.objects.get(pk=id).delete() # demo website: disable deleting feeds
except Exception as e: if not request.get_host() == "hrss.hipstercat.fr:443":
pass Feed.objects.get(pk=id).delete()
finally: logger.info("Removed feed ID "+id)
return redirect("/feeds") return redirect("feeds")
else:
return HttpResponse("Deleting is disabled on demo website.", status=403)
except ObjectDoesNotExist:
return redirect("feeds")
def rss(request, uurl): def rss(request, uurl):
try: try:
feed = Feed.objects.get(uurl=uurl) feed = Feed.objects.get(uurl=uurl)
return render(request, "feed.xml", {"feed": feed, "rss": fetch_feed(feed)}, content_type="application/rss+xml") fetched = fetch_feed(feed)
if not fetched or not fetched["items"]:
return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
except ObjectDoesNotExist: except ObjectDoesNotExist:
return HttpResponse("nope") return HttpResponse("Error: feed is unknown", status=404)