Compare commits

...

16 Commits
v0.1 ... master

14 changed files with 150 additions and 54 deletions

View File

@ -28,7 +28,7 @@ HRSS is an application that allows you to transform any website into a RSS feed.
[Because we all love demos.](https://hrss.hipstercat.fr)
# Installation
# Installation (native)
1. Make sure you have Python 3 and virtualenv installed.
2. Clone this repo for the unstable branch, or download the [latest release](https://hipstercat.fr/gogs/hipstercat/hrss/releases).
@ -36,4 +36,7 @@ HRSS is an application that allows you to transform any website into a RSS feed.
4. `bin/activate && pip3 install -r requirements.txt` to install dependencies.
5. `python3 manage.py migrate` to apply latest DB migrations (and create the H2 database file).
6. `python3 manage.py runserver` to run the integrated webserver (not suitable for production use - even though you should not use HRSS in a production environment at all until it is considered stable anyway). Use uWSGI to run it in a production environment.
# Installation (Docker)
Use https://github.com/kmlucy/docker-hrss, kindly made by [kmlucy](https://github.com/kmlucy).

View File

@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/2.0/ref/settings/
"""
import os
import urllib.parse
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@ -27,6 +28,9 @@ DEBUG = True
ALLOWED_HOSTS = []
# Set up your website base URL here, WITHOUT A TRAILING SLASH.
BASE_URL = "http://localhost:8000"
# Application definition
@ -119,7 +123,43 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.0/howto/static-files/
STATIC_URL = '/static/'
STATIC_URL = '/'+urllib.parse.urlparse(BASE_URL).path[1:]+'/static/' if urllib.parse.urlparse(BASE_URL).path[1:] else '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
USE_X_FORWARDED_HOST = True
USE_X_FORWARDED_HOST = True
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'verbose': {
'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s'
},
'simple': {
'format': '%(levelname)s %(message)s'
},
},
'handlers': {
'file': {
'class': 'logging.FileHandler',
'filename': 'debug.log',
'formatter': 'verbose'
},
'console': {
'class': 'logging.StreamHandler',
'formatter': 'verbose'
},
},
'loggers': {
'web': {
'handlers': ['console', 'file'],
'level': 'DEBUG',
},
'django': {
'handlers': ['console', 'file'],
'level': 'ERROR',
}
}
}
X_FRAME_OPTIONS = 'SAMEORIGIN'

View File

@ -16,8 +16,12 @@ Including another URLconf
from django.contrib import admin
from django.urls import path
from django.conf.urls import include, url
from django.conf import settings
import urllib.parse
base_path = urllib.parse.urlparse(settings.BASE_URL).path[1:]
urlpatterns = [
path('admin/', admin.site.urls),
url(r'', include('web.urls')),
path(base_path+'/admin/' if base_path else 'admin/', admin.site.urls),
url(base_path+'/' if base_path else '', include('web.urls')),
]

4
requirements.txt Normal file
View File

@ -0,0 +1,4 @@
Django>=2.1.2
beautifulsoup4>=4.6.3
requests>=2.20.0
html5lib>=1.0.1

View File

@ -1,3 +1,4 @@
from django.contrib import admin
from .models import Feed
# Register your models here.
admin.site.register(Feed)

View File

@ -1,11 +1,9 @@
from django.conf import settings
import urllib.parse
def BASE_URL(request):
"""
Return a BASE_URL template context for the current request.
"""
if request.is_secure():
scheme = 'https://'
else:
scheme = 'http://'
fullhost = request.get_host()
base = fullhost.split(":")[0]
return {'BASE_URL': scheme + base, }
parse = urllib.parse.urlparse(settings.BASE_URL)
return {'BASE_URL': parse.scheme+"://"+parse.netloc, }

View File

@ -10,7 +10,7 @@ def random_url():
class Feed(models.Model):
url = models.URLField(max_length=255)
element = models.CharField(max_length=255)
element = models.CharField(max_length=1000)
title = models.CharField(max_length=255)
content = models.CharField(max_length=255)
date = models.CharField(max_length=255)
@ -18,4 +18,4 @@ class Feed(models.Model):
link = models.CharField(max_length=255)
creation_date = models.DateTimeField(auto_now=True)
uurl = models.CharField(max_length=18, default=random_url, unique=True)
uurl = models.CharField(max_length=18, default=random_url, unique=True)

View File

@ -19,7 +19,7 @@
<div id="collapse{{ feed.id }}" class="collapse" aria-labelledby="heading{{ feed.id }}" data-parent="#accordionExample">
<div class="card-body">
<code>{{ BASE_URL }}/{{ feed.uurl }}.rss</code>
<code>{{ BASE_URL }}{% url 'rss' uurl=feed.uurl %}</code>
<p><a class="btn btn-danger" href="{% url 'feed_delete' feed.id %}">Delete</a></p>
</div>
</div>

View File

@ -5,7 +5,7 @@
<div class="col-lg-12 text-center">
<h1 class="mt-5">Generate RSS out of any website</h1>
{% if error %}<div class="alert alert-danger" role="alert">{{ error }}</div>{% endif %}
<form action="/" method="post">
<form action="{% url 'homepage' %}" method="post">
{% csrf_token %}
<div class="form-group">
<input type="text" class="form-control" name="url" id="url" placeholder="http://" {% if url %}value="{{ url }}"{% endif %}>

View File

@ -29,7 +29,7 @@
<code style="display:block" id="link-selector"></code>
</li>
<li>
<form action="/newfeed" method="post">
<form action="{% url 'newfeed' %}" method="post">
{% csrf_token %}
<input type="hidden" id="url" name="url" value="{{ url }}">
<input type="hidden" id="element" name="element" value="">
@ -44,7 +44,7 @@
</ul>
</div>
<!-- /#sidebar-wrapper -->
<iframe id="preview" style="width:100%" src="/iframe/{{ url }}"></iframe>
<iframe id="preview" style="width:100%" src="{% url 'iframe' encodedurl=encodedurl %}"></iframe>
<script>
$(function() {
function handleResize() {

View File

@ -26,14 +26,14 @@
<!-- Navigation -->
<nav class="navbar navbar-expand-lg navbar-dark bg-dark static-top">
<div class="container">
<a class="navbar-brand" href="/">HRSS</a>
<a class="navbar-brand" href="{% url 'homepage' %}">HRSS</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarResponsive">
<ul class="navbar-nav ml-auto">
<li class="nav-item">
<a class="nav-link" href="/">New
<a class="nav-link" href="{% url 'homepage' %}">New
<span class="sr-only">(current)</span>
</a>
</li>

View File

@ -3,8 +3,8 @@ from . import views
urlpatterns = [
url(r'^$', views.homepage, name='homepage'),
url(r'^iframe/(?P<url>.+)$', views.iframe, name='iframe'),
url(r'^setup/(?P<url>.+)$', views.setup, name='setup'),
url(r'^iframe/(?P<encodedurl>.+)$', views.iframe, name='iframe'),
url(r'^setup/(?P<encodedurl>.+)$', views.setup, name='setup'),
url(r'^newfeed$', views.newfeed, name='newfeed'),
url(r'^feeds$', views.feeds, name='feeds'),
url(r'^feeds/delete/(?P<id>[0-9]+)$', views.feed_delete, name='feed_delete'),

View File

@ -2,20 +2,25 @@ import requests
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
def get_url(url, timeout=30):
    """Fetch ``url`` with an HTTP GET request using a browser-like User-Agent.

    Args:
        url: The address to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a server that never
            answers would block the caller indefinitely.

    Returns:
        The ``requests`` response object for the request.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires (``Timeout`` is a subclass).
    """
    # NOTE(review): presumably a desktop Firefox User-Agent is sent because
    # some sites reject the default python-requests one - confirm.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
    return requests.get(url, headers=headers, timeout=timeout)
def is_valid_url(url):
try:
URLValidator()(url)
requests.get(url).raise_for_status()
get_url(url).raise_for_status()
return True
except ValidationError as e:
print(url+" is not a valid url")
logger.debug(url+" is not a valid url")
return False
except RequestException as e:
print(url+" led to exception: "+str(e))
logger.debug(url+" led to exception: "+str(e))
return False
@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
from difflib import SequenceMatcher
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
return s1[match.a: match.a + match.size] # -> apple pie
return s1[match.a: match.a + match.size]
def fetch_feed(feed, limit=10):
logger.debug("Fetching feed "+feed.url)
items=[]
rep = requests.get(feed.url).text
soup = BeautifulSoup(rep)
elements = soup.select(feed.element)
rep = get_url(feed.url).text
soup = BeautifulSoup(rep, "html5lib")
try:
elements = soup.select(feed.element)
except Exception as e:
logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
return False
logger.debug("Found "+str(len(elements))+" news")
for element in elements:
if len(items) >= limit:
break
logger.debug("Title ("+feed.element+ " > "+feed.title+")")
try:
title = element.select(feed.title)[0].text
except Exception:
logger.debug("Match title "+title)
except Exception as e:
logger.debug("Error while selecting feed title: "+str(e))
return False
try:
content = element.select(feed.content)[0].text
except Exception:
except Exception as e:
logger.debug("Error while selecting content: "+str(e))
return False
try:
date = element.select(feed.date)[0].text
@ -52,6 +67,12 @@ def fetch_feed(feed, limit=10):
author = False
try:
link = element.select(feed.link)[0]["href"]
if link and len(link) > 2 and link[0] == "/" and link[1] != "/":
# fixes issue #5 with relative link
# prepend base url
base_scheme = feed.url.split("://")[0]
base_url = feed.url.split("//")[-1].split("/")[0].split('?')[0]
link = base_scheme + "://" + base_url + link
except Exception:
link = False
items.append({"title": title, "content": content, "pubDate": date, "author": author, "link": link})

View File

@ -4,22 +4,30 @@ from .utils import *
from .models import Feed
from django.db.models import ObjectDoesNotExist
from bs4 import BeautifulSoup
import logging
from urllib.parse import quote_plus, unquote_plus
import traceback
import sys
# Create your views here.
logger = logging.getLogger(__name__)
def iframe(request, url):
content_type = False
def iframe(request, encodedurl):
sys.setrecursionlimit(10000)
try:
req = requests.get(url)
url = unquote_plus(encodedurl)
req = get_url(url)
html = req.content
bs = False
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
if not content_type or content_type.startswith("text/html"):
logger.debug("No content-type or content-type ~= '^text/html'")
bs = BeautifulSoup(html, 'html.parser')
base_scheme = url.split("://")[0]
base_url = url.split("//")[-1].split("/")[0].split('?')[0]
logger.debug("URL: "+base_scheme+"://"+base_url)
# fixes
# fix click links
@ -33,6 +41,11 @@ def iframe(request, url):
if link["href"].startswith("/"):
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
# test: remove js
all_scripts = bs.find_all("script")
for script in all_scripts:
script.extract()
# fix absolute javascript
all_scripts = bs.find_all("script", {"src": True})
for script in all_scripts:
@ -48,32 +61,37 @@ def iframe(request, url):
html = final_html
except Exception as e:
html = str(e)
traceback.print_exc()
return HttpResponse("An error has occured", content_type=500)
return HttpResponse(html, content_type=content_type)
def dummy(request):
return HttpResponse("toto")
def homepage(request):
if request.method == 'POST':
if "url" in request.POST and request.POST["url"]:
url = request.POST["url"]
if is_valid_url(url):
return redirect("/setup/"+url)
return redirect("setup", encodedurl=quote_plus(url))
else:
return render(request, 'homepage.html', {"url": url, "error": url+" is not a valid URL."})
return render(request, 'homepage.html')
def setup(request, url):
if is_valid_url(url):
return render(request, 'setup.html', {"url": url})
def setup(request, encodedurl):
decoded_url = unquote_plus(encodedurl)
if is_valid_url(decoded_url):
return render(request, 'setup.html', {"encodedurl": encodedurl, "url": decoded_url})
else:
return redirect("/")
return redirect("homepage")
def newfeed(request):
if request.method == 'POST':
if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST:
if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
return HttpResponse("Error, missing required element")
url = request.POST["url"]
element = request.POST["element"]
@ -101,9 +119,9 @@ def newfeed(request):
feed = Feed(url=url, element=element, title=title, content=content, date=date, author=author, link=link)
feed.save()
return redirect("/feeds")
return redirect("feeds")
else:
return redirect("/")
return redirect("homepage")
def feeds(request):
feeds = Feed.objects.all()
@ -111,15 +129,22 @@ def feeds(request):
def feed_delete(request, id):
try:
Feed.objects.get(pk=id).delete()
except Exception as e:
pass
finally:
return redirect("/feeds")
# demo website: disable deleting feeds
if not request.get_host() == "hrss.hipstercat.fr:443":
Feed.objects.get(pk=id).delete()
logger.info("Removed feed ID "+id)
return redirect("feeds")
else:
return HttpResponse("Deleting is disabled on demo website.", status=403)
except ObjectDoesNotExist:
return redirect("feeds")
def rss(request, uurl):
try:
feed = Feed.objects.get(uurl=uurl)
return render(request, "feed.xml", {"feed": feed, "rss": fetch_feed(feed)}, content_type="application/rss+xml")
fetched = fetch_feed(feed)
if not fetched or not fetched["items"]:
return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
except ObjectDoesNotExist:
return HttpResponse("nope")
return HttpResponse("Error: feed is unknown", status=404)