titles: use rdrama's title finding code (#425)
* titles: use rdrama's title finding code
This fixes a potential DoS on some really weird pages (it appears to be a bug in BS4), since we no longer parse arbitrary HTML. In addition, we add some sanity checks on the submitted URL.
* unescape title to fix bug from upstream
* fix NameError
* Do not proxy requests, since no proxy is available.
On the upstream, the `proxies` dict was intended to use a local SOCKS
proxy running on port 18080 with the express purpose of masking the
server IP address. TheMotte isn't running behind a reverse proxy, so
this purpose is moot. Additionally, we don't have a proxy running in
Docker, nor do we appear to have one on prod, which breaks autotitle
and thumbnailing regardless. Not sure it matters for TheMotte's use
case, but both codepaths have been inoperative because of it.
* use gevent to time out the function, preventing a second theoretical
DoS where a peer sends data really slowly (a sketch of the pattern
follows below)
ref: 816389cf28
Co-authored-by: TLSM <duolsm@outlook.com>
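
For illustration, a minimal sketch of the timeout pattern this commit adopts, assuming a gevent-patched worker as this codebase uses (`fetch_with_deadline` is a hypothetical name, not part of the diff). requests' timeout= only bounds each individual socket read, not the whole response, so a server trickling one byte per read interval can hold the worker indefinitely; gevent.with_timeout() caps the total wall-clock time instead.

    import gevent
    import requests

    POST_TITLE_TIMEOUT = 5  # total wall-clock budget, in seconds

    def fetch_with_deadline(url):
        try:
            # timeout= below still guards each socket read; with_timeout()
            # additionally caps the total elapsed time of the whole call.
            return gevent.with_timeout(POST_TITLE_TIMEOUT, requests.get,
                                       url, timeout=POST_TITLE_TIMEOUT)
        except gevent.Timeout:
            return None  # budget exceeded, e.g. a slow-loris server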
parent c3f4b540d0
commit 8a9e1bc54a
2 changed files with 29 additions and 10 deletions
files/helpers/const.py
@@ -44,6 +44,12 @@ COLORS = {'ff66ac','805ad5','62ca56','38a169','80ffff','2a96f3','eb4963','ff0000
 LOGGEDIN_ACTIVE_TIME = 15 * 60
 
+IMAGE_FORMATS = ['png','gif','jpg','jpeg','webp']
+VIDEO_FORMATS = ['mp4','webm','mov','avi','mkv','flv','m4v','3gp']
+AUDIO_FORMATS = ['mp3','wav','ogg','aac','m4a','flac']
+NO_TITLE_EXTENSIONS = IMAGE_FORMATS + VIDEO_FORMATS + AUDIO_FORMATS
+
+
 
 AWARDS = {
 	"lootbox": {
 		"kind": "lootbox",
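
For illustration, a sketch of how these constants are consumed by the route change further down (assuming the constants above are in scope; `points_at_media` is a hypothetical helper name, not part of the diff): links that point straight at media files cannot carry an HTML <title>, so the route rejects them before making any request.

    def points_at_media(url):
        # lowercase, then strip any query string before testing the extension
        checking_url = url.lower().split('?')[0].split('%3F')[0]
        return any(checking_url.endswith(f'.{ext}') for ext in NO_TITLE_EXTENSIONS)

    # points_at_media('https://example.com/cat.JPG?w=640')  -> True
    # points_at_media('https://example.com/an-article')     -> False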
@@ -250,7 +256,7 @@ utm_regex2 = re.compile('[?&]utm_[a-z]+=[a-z0-9_]+', flags=re.A)
 
 YOUTUBE_KEY = environ.get("YOUTUBE_KEY", "").strip()
 
-proxies = {"http":"http://127.0.0.1:18080","https":"http://127.0.0.1:18080"}
+proxies = {}
 
 approved_embed_hosts = [
 	'rdrama.net',
@@ -324,4 +330,6 @@ procoins_li = (0,2500,5000,10000,25000,50000,125000,250000)
 
 linefeeds_regex = re.compile("([^\\n])\\n([^\\n])", flags=re.A)
 
+html_title_regex = re.compile("<title>(.{1,200})</title>", flags=re.I)
+
 def make_name(*args, **kwargs): return request.base_url
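
A minimal sketch of what this regex does in place of the BeautifulSoup parse, paired with html.unescape() (the "unescape title" fix from the commit message); the sample page string is illustrative.

    import html
    import re

    html_title_regex = re.compile("<title>(.{1,200})</title>", flags=re.I)

    page = '<html><head><title>Tom &amp; Jerry</title></head></html>'
    match = html_title_regex.search(page)
    if match:
        # unescape turns "&amp;" back into "&"
        print(html.unescape(match.group(1)))  # -> Tom & Jerry

The {1,200} bound also keeps the amount of attacker-controlled text that can land in a suggested title small.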
files/routes/posts.py
@@ -1139,21 +1139,32 @@ def api_pin_post(post_id, v):
 	else: return {"message": "Post unpinned!"}
 	return {"error": "Post not found!"}
 
 
 @app.get("/submit/title")
 @limiter.limit("6/minute")
 @auth_required
 def get_post_title(v):
+	POST_TITLE_TIMEOUT = 5
 	url = request.values.get("url")
-	if not url: abort(400)
+	if not url or '\\' in url: abort(400)
 	url = url.strip()
 	if not url.startswith('http'): abort(400)
-	try: x = requests.get(url, headers=titleheaders, timeout=5, proxies=proxies)
+
+	checking_url = url.lower().split('?')[0].split('%3F')[0]
+	if any((checking_url.endswith(f'.{x}') for x in NO_TITLE_EXTENSIONS)):
+		abort(400)
+
+	try:
+		x = gevent.with_timeout(POST_TITLE_TIMEOUT, requests.get,
+			url, headers=titleheaders, timeout=POST_TITLE_TIMEOUT,
+			proxies=proxies)
 	except: abort(400)
 
 	content_type = x.headers.get("Content-Type")
 	if not content_type or "text/html" not in content_type: abort(400)
-	soup = BeautifulSoup(x.content, 'lxml')
-	title = soup.find('title')
-	if not title: abort(400)
-	return {"url": url, "title": title.string}
+
+	match = html_title_regex.search(x.text)
+	if match and match.lastindex >= 1:
+		title = html.unescape(match.group(1))
+	else: abort(400)
+	return {"url": url, "title": title}
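
A hypothetical client-side call against the finished endpoint (host name and cookie value are placeholders; @auth_required means a logged-in session is needed):

    import requests

    session_cookies = {"token": "<session cookie>"}  # placeholder value
    r = requests.get("https://themotte.example/submit/title",
                     params={"url": "https://example.com/"},
                     cookies=session_cookies)
    print(r.json())  # e.g. {"url": "https://example.com/", "title": "Example Domain"}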