Compare commits
16 Commits
Author | SHA1 | Date |
---|---|---|
Amazed | d48f2f5d5c | |
Amazed | 36acc9137d | |
Amazed | 47e24b6459 | |
Amazed | 6dff57aaaa | |
Amazed | 64f6e33b34 | |
Amazed | 3d8566af04 | |
Amazed | 2876fe663d | |
Amazed | 7872bd31af | |
Amazed | af9b1f0c5d | |
Amazed | bbefbea939 | |
Amazed | e2028bbe48 | |
Amazed | a7bbd79cf3 | |
Amazed | f61a6c42c5 | |
Amazed | 91d7839703 | |
Amazed | be90bbd67c | |
Amazed | 336c1cdcc3 |
|
@ -28,7 +28,7 @@ HRSS is an application that allows you to transform any website into a RSS feed.
|
|||
|
||||
[Because we all love demos.](https://hrss.hipstercat.fr)
|
||||
|
||||
# Installation
|
||||
# Installation (native)
|
||||
|
||||
1. Make sure you have Python 3 and virtualenv installed.
|
||||
2. Clone this repo for the unstable branch, or download the [latest release](https://hipstercat.fr/gogs/hipstercat/hrss/releases).
|
||||
|
@ -36,4 +36,7 @@ HRSS is an application that allows you to transform any website into a RSS feed.
|
|||
4. `bin/activate && pip3 install -r requirements.txt` to install dependencies.
|
||||
5. `python3 manage.py migrate` to apply latest DB migrations (and create the H2 database file).
|
||||
6. `python3 manage.py runserver` to run the integrated webserver (not suitable for production use - even though you should not use HRSS in a production environment at all until it is considered stable anyway). Use uWSGI to run it in a production environment.
|
||||
|
||||
|
||||
# Installation (Docker)
|
||||
|
||||
Use https://github.com/kmlucy/docker-hrss nicely made by [kmlucy](https://github.com/kmlucy).
|
|
@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/2.0/ref/settings/
|
|||
"""
|
||||
|
||||
import os
|
||||
import urllib.parse
|
||||
|
||||
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
|
||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
@ -27,6 +28,9 @@ DEBUG = True
|
|||
|
||||
ALLOWED_HOSTS = []
|
||||
|
||||
# Set up your website base URL here, WITHOUT A TRAILING SLASH.
|
||||
BASE_URL = "http://localhost:8000"
|
||||
|
||||
|
||||
# Application definition
|
||||
|
||||
|
@ -119,7 +123,43 @@ USE_TZ = True
|
|||
# Static files (CSS, JavaScript, Images)
|
||||
# https://docs.djangoproject.com/en/2.0/howto/static-files/
|
||||
|
||||
STATIC_URL = '/static/'
|
||||
STATIC_URL = '/'+urllib.parse.urlparse(BASE_URL).path[1:]+'/static/' if urllib.parse.urlparse(BASE_URL).path[1:] else '/static/'
|
||||
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
|
||||
|
||||
USE_X_FORWARDED_HOST = True
|
||||
USE_X_FORWARDED_HOST = True
|
||||
|
||||
LOGGING = {
|
||||
'version': 1,
|
||||
'disable_existing_loggers': False,
|
||||
'formatters': {
|
||||
'verbose': {
|
||||
'format': '[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s'
|
||||
},
|
||||
'simple': {
|
||||
'format': '%(levelname)s %(message)s'
|
||||
},
|
||||
},
|
||||
'handlers': {
|
||||
'file': {
|
||||
'class': 'logging.FileHandler',
|
||||
'filename': 'debug.log',
|
||||
'formatter': 'verbose'
|
||||
},
|
||||
'console': {
|
||||
'class': 'logging.StreamHandler',
|
||||
'formatter': 'verbose'
|
||||
},
|
||||
},
|
||||
'loggers': {
|
||||
'web': {
|
||||
'handlers': ['console', 'file'],
|
||||
'level': 'DEBUG',
|
||||
},
|
||||
'django': {
|
||||
'handlers': ['console', 'file'],
|
||||
'level': 'ERROR',
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
X_FRAME_OPTIONS = 'SAMEORIGIN'
|
||||
|
|
|
@ -16,8 +16,12 @@ Including another URLconf
|
|||
from django.contrib import admin
|
||||
from django.urls import path
|
||||
from django.conf.urls import include, url
|
||||
from django.conf import settings
|
||||
import urllib.parse
|
||||
base_path = urllib.parse.urlparse(settings.BASE_URL).path[1:]
|
||||
|
||||
|
||||
urlpatterns = [
|
||||
path('admin/', admin.site.urls),
|
||||
url(r'', include('web.urls')),
|
||||
path(base_path+'/admin/' if base_path else 'admin/', admin.site.urls),
|
||||
url(base_path+'/' if base_path else '', include('web.urls')),
|
||||
]
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
Django>=2.1.2
|
||||
beautifulsoup4>=4.6.3
|
||||
requests>=2.20.0
|
||||
html5lib>=1.0.1
|
|
@ -1,3 +1,4 @@
|
|||
from django.contrib import admin
|
||||
from .models import Feed
|
||||
|
||||
# Register your models here.
|
||||
admin.site.register(Feed)
|
|
@ -1,11 +1,9 @@
|
|||
from django.conf import settings
|
||||
import urllib.parse
|
||||
|
||||
def BASE_URL(request):
|
||||
"""
|
||||
Return a BASE_URL template context for the current request.
|
||||
"""
|
||||
if request.is_secure():
|
||||
scheme = 'https://'
|
||||
else:
|
||||
scheme = 'http://'
|
||||
fullhost = request.get_host()
|
||||
base = fullhost.split(":")[0]
|
||||
return {'BASE_URL': scheme + base, }
|
||||
parse = urllib.parse.urlparse(settings.BASE_URL)
|
||||
return {'BASE_URL': parse.scheme+"://"+parse.netloc, }
|
|
@ -10,7 +10,7 @@ def random_url():
|
|||
|
||||
class Feed(models.Model):
|
||||
url = models.URLField(max_length=255)
|
||||
element = models.CharField(max_length=255)
|
||||
element = models.CharField(max_length=1000)
|
||||
title = models.CharField(max_length=255)
|
||||
content = models.CharField(max_length=255)
|
||||
date = models.CharField(max_length=255)
|
||||
|
@ -18,4 +18,4 @@ class Feed(models.Model):
|
|||
link = models.CharField(max_length=255)
|
||||
creation_date = models.DateTimeField(auto_now=True)
|
||||
|
||||
uurl = models.CharField(max_length=18, default=random_url, unique=True)
|
||||
uurl = models.CharField(max_length=18, default=random_url, unique=True)
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
<div id="collapse{{ feed.id }}" class="collapse" aria-labelledby="heading{{ feed.id }}" data-parent="#accordionExample">
|
||||
<div class="card-body">
|
||||
<code>{{ BASE_URL }}/{{ feed.uurl }}.rss</code>
|
||||
<code>{{ BASE_URL }}{% url 'rss' uurl=feed.uurl %}</code>
|
||||
<p><a class="btn btn-danger" href="{% url 'feed_delete' feed.id %}">Delete</a></p>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<div class="col-lg-12 text-center">
|
||||
<h1 class="mt-5">Generate RSS out of any website</h1>
|
||||
{% if error %}<div class="alert alert-danger" role="alert">{{ error }}</div>{% endif %}
|
||||
<form action="/" method="post">
|
||||
<form action="{% url 'homepage' %}" method="post">
|
||||
{% csrf_token %}
|
||||
<div class="form-group">
|
||||
<input type="text" class="form-control" name="url" id="url" placeholder="http://" {% if url %}value="{{ url }}"{% endif %}>
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
<code style="display:block" id="link-selector"></code>
|
||||
</li>
|
||||
<li>
|
||||
<form action="/newfeed" method="post">
|
||||
<form action="{% url 'newfeed' %}" method="post">
|
||||
{% csrf_token %}
|
||||
<input type="hidden" id="url" name="url" value="{{ url }}">
|
||||
<input type="hidden" id="element" name="element" value="">
|
||||
|
@ -44,7 +44,7 @@
|
|||
</ul>
|
||||
</div>
|
||||
<!-- /#sidebar-wrapper -->
|
||||
<iframe id="preview" style="width:100%" src="/iframe/{{ url }}"></iframe>
|
||||
<iframe id="preview" style="width:100%" src="{% url 'iframe' encodedurl=encodedurl %}"></iframe>
|
||||
<script>
|
||||
$(function() {
|
||||
function handleResize() {
|
||||
|
|
|
@ -26,14 +26,14 @@
|
|||
<!-- Navigation -->
|
||||
<nav class="navbar navbar-expand-lg navbar-dark bg-dark static-top">
|
||||
<div class="container">
|
||||
<a class="navbar-brand" href="/">HRSS</a>
|
||||
<a class="navbar-brand" href="{% url 'homepage' %}">HRSS</a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarResponsive">
|
||||
<ul class="navbar-nav ml-auto">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="/">New
|
||||
<a class="nav-link" href="{% url 'homepage' %}">New
|
||||
<span class="sr-only">(current)</span>
|
||||
</a>
|
||||
</li>
|
||||
|
|
|
@ -3,8 +3,8 @@ from . import views
|
|||
|
||||
urlpatterns = [
|
||||
url(r'^$', views.homepage, name='homepage'),
|
||||
url(r'^iframe/(?P<url>.+)$', views.iframe, name='iframe'),
|
||||
url(r'^setup/(?P<url>.+)$', views.setup, name='setup'),
|
||||
url(r'^iframe/(?P<encodedurl>.+)$', views.iframe, name='iframe'),
|
||||
url(r'^setup/(?P<encodedurl>.+)$', views.setup, name='setup'),
|
||||
url(r'^newfeed$', views.newfeed, name='newfeed'),
|
||||
url(r'^feeds$', views.feeds, name='feeds'),
|
||||
url(r'^feeds/delete/(?P<id>[0-9]+)$', views.feed_delete, name='feed_delete'),
|
||||
|
|
43
web/utils.py
43
web/utils.py
|
@ -2,20 +2,25 @@ import requests
|
|||
from django.core.validators import URLValidator
|
||||
from django.core.exceptions import ValidationError
|
||||
from requests.exceptions import RequestException
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_url(url):
|
||||
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"}
|
||||
return requests.get(url, headers=headers)
|
||||
|
||||
def is_valid_url(url):
|
||||
try:
|
||||
URLValidator()(url)
|
||||
requests.get(url).raise_for_status()
|
||||
get_url(url).raise_for_status()
|
||||
return True
|
||||
except ValidationError as e:
|
||||
print(url+" is not a valid url")
|
||||
logger.debug(url+" is not a valid url")
|
||||
return False
|
||||
except RequestException as e:
|
||||
print(url+" led to exception: "+str(e))
|
||||
logger.debug(url+" led to exception: "+str(e))
|
||||
return False
|
||||
|
||||
|
||||
|
@ -23,24 +28,34 @@ def find_longest_common_string(s1, s2):
|
|||
from difflib import SequenceMatcher
|
||||
|
||||
match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
|
||||
return s1[match.a: match.a + match.size] # -> apple pie
|
||||
return s1[match.a: match.a + match.size]
|
||||
|
||||
|
||||
def fetch_feed(feed, limit=10):
|
||||
logger.debug("Fetching feed "+feed.url)
|
||||
items=[]
|
||||
rep = requests.get(feed.url).text
|
||||
soup = BeautifulSoup(rep)
|
||||
elements = soup.select(feed.element)
|
||||
rep = get_url(feed.url).text
|
||||
soup = BeautifulSoup(rep, "html5lib")
|
||||
try:
|
||||
elements = soup.select(feed.element)
|
||||
except Exception as e:
|
||||
logger.debug("Error while fetching elements ("+feed.element+"): "+str(e))
|
||||
return False
|
||||
logger.debug("Found "+str(len(elements))+" news")
|
||||
for element in elements:
|
||||
if len(items) >= limit:
|
||||
break
|
||||
|
||||
logger.debug("Title ("+feed.element+ " > "+feed.title+")")
|
||||
try:
|
||||
title = element.select(feed.title)[0].text
|
||||
except Exception:
|
||||
logger.debug("Match title "+title)
|
||||
except Exception as e:
|
||||
logger.debug("Error while selecting feed title: "+str(e))
|
||||
return False
|
||||
try:
|
||||
content = element.select(feed.content)[0].text
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
logger.debug("Error while selecting content: "+str(e))
|
||||
return False
|
||||
try:
|
||||
date = element.select(feed.date)[0].text
|
||||
|
@ -52,6 +67,12 @@ def fetch_feed(feed, limit=10):
|
|||
author = False
|
||||
try:
|
||||
link = element.select(feed.link)[0]["href"]
|
||||
if link and len(link) > 2 and link[0] == "/" and link[1] != "/":
|
||||
# fixes issue #5 with relative link
|
||||
# prepend base url
|
||||
base_scheme = feed.url.split("://")[0]
|
||||
base_url = feed.url.split("//")[-1].split("/")[0].split('?')[0]
|
||||
link = base_scheme + "://" + base_url + link
|
||||
except Exception:
|
||||
link = False
|
||||
items.append({"title": title, "content": content, "pubDate": date, "author": author, "link": link})
|
||||
|
|
63
web/views.py
63
web/views.py
|
@ -4,22 +4,30 @@ from .utils import *
|
|||
from .models import Feed
|
||||
from django.db.models import ObjectDoesNotExist
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
from urllib.parse import quote_plus, unquote_plus
|
||||
import traceback
|
||||
import sys
|
||||
# Create your views here.
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def iframe(request, url):
|
||||
content_type = False
|
||||
def iframe(request, encodedurl):
|
||||
sys.setrecursionlimit(10000)
|
||||
try:
|
||||
req = requests.get(url)
|
||||
url = unquote_plus(encodedurl)
|
||||
req = get_url(url)
|
||||
html = req.content
|
||||
bs = False
|
||||
|
||||
content_type = req.headers["Content-Type"] if "Content-Type" in req.headers else False
|
||||
|
||||
if not content_type or content_type.startswith("text/html"):
|
||||
logger.debug("No content-type or content-type ~= '^text/html'")
|
||||
bs = BeautifulSoup(html, 'html.parser')
|
||||
base_scheme = url.split("://")[0]
|
||||
base_url = url.split("//")[-1].split("/")[0].split('?')[0]
|
||||
logger.debug("URL: "+base_scheme+"://"+base_url)
|
||||
|
||||
# fixes
|
||||
# fix click links
|
||||
|
@ -33,6 +41,11 @@ def iframe(request, url):
|
|||
if link["href"].startswith("/"):
|
||||
link["href"] = "/iframe/" + base_scheme + "://" + base_url + link["href"]
|
||||
|
||||
# test: remove js
|
||||
all_scripts = bs.find_all("script")
|
||||
for script in all_scripts:
|
||||
script.extract()
|
||||
|
||||
# fix absolute javascript
|
||||
all_scripts = bs.find_all("script", {"src": True})
|
||||
for script in all_scripts:
|
||||
|
@ -48,32 +61,37 @@ def iframe(request, url):
|
|||
html = final_html
|
||||
|
||||
except Exception as e:
|
||||
html = str(e)
|
||||
traceback.print_exc()
|
||||
return HttpResponse("An error has occured", content_type=500)
|
||||
return HttpResponse(html, content_type=content_type)
|
||||
|
||||
|
||||
def dummy(request):
|
||||
return HttpResponse("toto")
|
||||
|
||||
|
||||
def homepage(request):
|
||||
if request.method == 'POST':
|
||||
if "url" in request.POST and request.POST["url"]:
|
||||
url = request.POST["url"]
|
||||
if is_valid_url(url):
|
||||
return redirect("/setup/"+url)
|
||||
return redirect("setup", encodedurl=quote_plus(url))
|
||||
else:
|
||||
return render(request, 'homepage.html', {"url": url, "error": url+" is not a valid URL."})
|
||||
return render(request, 'homepage.html')
|
||||
|
||||
def setup(request, url):
|
||||
if is_valid_url(url):
|
||||
return render(request, 'setup.html', {"url": url})
|
||||
|
||||
def setup(request, encodedurl):
|
||||
decoded_url = unquote_plus(encodedurl)
|
||||
if is_valid_url(decoded_url):
|
||||
return render(request, 'setup.html', {"encodedurl": encodedurl, "url": decoded_url})
|
||||
else:
|
||||
return redirect("/")
|
||||
return redirect("homepage")
|
||||
|
||||
|
||||
def newfeed(request):
|
||||
if request.method == 'POST':
|
||||
if not "url" in request.POST or not "element" in request.POST or not "title" in request.POST or not "content" in request.POST:
|
||||
if not "url" in request.POST or not request.POST["url"] or not "element" in request.POST or not request.POST["element"] or not "title" in request.POST or not request.POST["title"] or not "content" in request.POST or not request.POST["content"]:
|
||||
return HttpResponse("Error, missing required element")
|
||||
url = request.POST["url"]
|
||||
element = request.POST["element"]
|
||||
|
@ -101,9 +119,9 @@ def newfeed(request):
|
|||
feed = Feed(url=url, element=element, title=title, content=content, date=date, author=author, link=link)
|
||||
feed.save()
|
||||
|
||||
return redirect("/feeds")
|
||||
return redirect("feeds")
|
||||
else:
|
||||
return redirect("/")
|
||||
return redirect("homepage")
|
||||
|
||||
def feeds(request):
|
||||
feeds = Feed.objects.all()
|
||||
|
@ -111,15 +129,22 @@ def feeds(request):
|
|||
|
||||
def feed_delete(request, id):
|
||||
try:
|
||||
Feed.objects.get(pk=id).delete()
|
||||
except Exception as e:
|
||||
pass
|
||||
finally:
|
||||
return redirect("/feeds")
|
||||
# demo website: disable deleting feeds
|
||||
if not request.get_host() == "hrss.hipstercat.fr:443":
|
||||
Feed.objects.get(pk=id).delete()
|
||||
logger.info("Removed feed ID "+id)
|
||||
return redirect("feeds")
|
||||
else:
|
||||
return HttpResponse("Deleting is disabled on demo website.", status=403)
|
||||
except ObjectDoesNotExist:
|
||||
return redirect("feeds")
|
||||
|
||||
def rss(request, uurl):
|
||||
try:
|
||||
feed = Feed.objects.get(uurl=uurl)
|
||||
return render(request, "feed.xml", {"feed": feed, "rss": fetch_feed(feed)}, content_type="application/rss+xml")
|
||||
fetched = fetch_feed(feed)
|
||||
if not fetched or not fetched["items"]:
|
||||
return HttpResponse("Error: feed is empty. Did you set up 'element' field correctly?", status=422)
|
||||
return render(request, "feed.xml", {"feed": feed, "rss": fetched}, content_type="application/rss+xml")
|
||||
except ObjectDoesNotExist:
|
||||
return HttpResponse("nope")
|
||||
return HttpResponse("Error: feed is unknown", status=404)
|
Loading…
Reference in New Issue