sanitize: prevent worker crash during timeouts
Because themotte is ~~webscale~~ async, there is a window of time in which code in the sanitize function can crash the entire worker, and not in a fun way. This commit uses gevent to handle timeouts instead of a signal-based system, which is non-portable and very fragile, especially if themotte adds more async functionality (something that would probably further improve the performance of the site). Essentially, we don't want the failure of one request to take down the entire worker!
This commit is contained in:
parent
9ade35d22f
commit
1e9ca62892
1 changed files with 38 additions and 41 deletions
|
@ -10,21 +10,43 @@ import re
|
||||||
from mistletoe import markdown
|
from mistletoe import markdown
|
||||||
from json import loads, dump
|
from json import loads, dump
|
||||||
from random import random, choice
|
from random import random, choice
|
||||||
import signal
|
import gevent
|
||||||
import time
|
import time
|
||||||
import requests
|
import requests
|
||||||
from files.__main__ import app
|
from files.__main__ import app
|
||||||
|
|
||||||
TLDS = ('ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar','arpa','as','asia','at','au','aw','ax','az','ba','bb','bd','be','bf','bg','bh','bi','biz','bj','bm','bn','bo','br','bs','bt','bv','bw','by','bz','ca','cafe','cat','cc','cd','cf','cg','ch','ci','ck','cl','club','cm','cn','co','com','coop','cr','cu','cv','cx','cy','cz','de','dj','dk','dm','do','dz','ec','edu','ee','eg','er','es','et','eu','fi','fj','fk','fm','fo','fr','ga','gb','gd','ge','gf','gg','gh','gi','gl','gm','gn','gov','gp','gq','gr','gs','gt','gu','gw','gy','hk','hm','hn','hr','ht','hu','id','ie','il','im','in','info','int','io','iq','ir','is','it','je','jm','jo','jobs','jp','ke','kg','kh','ki','km','kn','kp','kr','kw','ky','kz','la','lb','lc','li','lk','lr','ls','lt','lu','lv','ly','ma','mc','md','me','mg','mh','mil','mk','ml','mm','mn','mo','mobi','mp','mq','mr','ms','mt','mu','museum','mv','mw','mx','my','mz','na','name','nc','ne','net','nf','ng','ni','nl','no','np','nr','nu','nz','om','org','pa','pe','pf','pg','ph','pk','pl','pm','pn','post','pr','pro','ps','pt','pw','py','qa','re','ro','rs','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sj','sk','sl','sm','sn','so','social','sr','ss','st','su','sv','sx','sy','sz','tc','td','tel','tf','tg','th','tj','tk','tl','tm','tn','to','tp','tr','travel','tt','tv','tw','tz','ua','ug','uk','us','uy','uz','va','vc','ve','vg','vi','vn','vu','wf','win','ws','xn','xxx','xyz','ye','yt','yu','za','zm','zw', 'moe')
|
TLDS = ('ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar',
|
||||||
|
'arpa','as','asia','at','au','aw','ax','az','ba','bb','bd','be','bf','bg',
|
||||||
|
'bh','bi','biz','bj','bm','bn','bo','br','bs','bt','bv','bw','by','bz',
|
||||||
|
'ca','cafe','cat','cc','cd','cf','cg','ch','ci','ck','cl','club','cm',
|
||||||
|
'cn','co','com','coop','cr','cu','cv','cx','cy','cz','de','dj','dk','dm',
|
||||||
|
'do','dz','ec','edu','ee','eg','er','es','et','eu','fi','fj','fk','fm',
|
||||||
|
'fo','fr','ga','gb','gd','ge','gf','gg','gh','gi','gl','gm','gn','gov',
|
||||||
|
'gp','gq','gr','gs','gt','gu','gw','gy','hk','hm','hn','hr','ht','hu',
|
||||||
|
'id','ie','il','im','in','info','int','io','iq','ir','is','it','je','jm',
|
||||||
|
'jo','jobs','jp','ke','kg','kh','ki','km','kn','kp','kr','kw','ky','kz',
|
||||||
|
'la','lb','lc','li','lk','lr','ls','lt','lu','lv','ly','ma','mc','md','me',
|
||||||
|
'mg','mh','mil','mk','ml','mm','mn','mo','mobi','mp','mq','mr','ms','mt',
|
||||||
|
'mu','museum','mv','mw','mx','my','mz','na','name','nc','ne','net','nf',
|
||||||
|
'ng','ni','nl','no','np','nr','nu','nz','om','org','pa','pe','pf','pg',
|
||||||
|
'ph','pk','pl','pm','pn','post','pr','pro','ps','pt','pw','py','qa','re',
|
||||||
|
'ro','rs','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sj','sk',
|
||||||
|
'sl','sm','sn','so','social','sr','ss','st','su','sv','sx','sy','sz',
|
||||||
|
'tc','td','tel','tf','tg','th','tj','tk','tl','tm','tn','to','tp','tr',
|
||||||
|
'travel','tt','tv','tw','tz','ua','ug','uk','us','uy','uz','va','vc','ve',
|
||||||
|
'vg','vi','vn','vu','wf','win','ws','xn','xxx','xyz','ye','yt','yu','za',
|
||||||
|
'zm','zw', 'moe')
|
||||||
|
|
||||||
allowed_tags = ('b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','a','span','ruby','rp','rt','spoiler',)
|
allowed_tags = ('b','blockquote','br','code','del','em','h1','h2','h3','h4',
|
||||||
|
'h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table',
|
||||||
|
'tbody','th','thead','td','tr','ul','a','span','ruby','rp','rt',
|
||||||
|
'spoiler',)
|
||||||
|
|
||||||
if app.config['MULTIMEDIA_EMBEDDING_ENABLED']:
|
if app.config['MULTIMEDIA_EMBEDDING_ENABLED']:
|
||||||
allowed_tags += ('img', 'lite-youtube', 'video', 'source',)
|
allowed_tags += ('img', 'lite-youtube', 'video', 'source',)
|
||||||
|
|
||||||
|
|
||||||
def allowed_attributes(tag, name, value):
|
def allowed_attributes(tag, name, value):
|
||||||
|
|
||||||
if name == 'style': return True
|
if name == 'style': return True
|
||||||
|
|
||||||
if tag == 'a':
|
if tag == 'a':
|
||||||
|
@ -123,31 +145,21 @@ def render_emoji(html, regexp, edit, marseys_used=set(), b=False):
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
def with_sigalrm_timeout(timeout: int):
|
def with_gevent_timeout(timeout: int):
|
||||||
'Use SIGALRM to raise an exception if the function executes for longer than timeout seconds'
|
'''
|
||||||
|
Use gevent to raise an exception if the function executes for longer than timeout seconds
|
||||||
# while trying to test this using time.sleep I discovered that gunicorn does in fact do some
|
Using gevent instead of a signal based approach allows for proper async and avoids some
|
||||||
# async so if we timeout on that (or on a db op) then the process is crashed without returning
|
worker crashes
|
||||||
# a proper 500 error. Oh well.
|
'''
|
||||||
def sig_handler(signum, frame):
|
|
||||||
print("Timeout!", flush=True)
|
|
||||||
raise Exception("Timeout")
|
|
||||||
|
|
||||||
def inner(func):
|
def inner(func):
|
||||||
@functools.wraps(inner)
|
@functools.wraps(func)
|
||||||
def wrapped(*args, **kwargs):
|
def wrapped(*args, **kwargs):
|
||||||
signal.signal(signal.SIGALRM, sig_handler)
|
return gevent.with_timeout(timeout, func, *args, **kwargs)
|
||||||
signal.alarm(timeout)
|
|
||||||
try:
|
|
||||||
return func(*args, **kwargs)
|
|
||||||
finally:
|
|
||||||
signal.alarm(0)
|
|
||||||
return wrapped
|
return wrapped
|
||||||
return inner
|
return inner
|
||||||
|
|
||||||
@with_sigalrm_timeout(2)
|
@with_gevent_timeout(2)
|
||||||
def sanitize(sanitized, alert=False, comment=False, edit=False):
|
def sanitize(sanitized, alert=False, comment=False, edit=False):
|
||||||
|
|
||||||
# double newlines, eg. hello\nworld becomes hello\n\nworld, which later becomes <p>hello</p><p>world</p>
|
# double newlines, eg. hello\nworld becomes hello\n\nworld, which later becomes <p>hello</p><p>world</p>
|
||||||
sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)
|
sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)
|
||||||
|
|
||||||
|
@ -190,11 +202,7 @@ def sanitize(sanitized, alert=False, comment=False, edit=False):
|
||||||
users = get_users(names,graceful=True)
|
users = get_users(names,graceful=True)
|
||||||
|
|
||||||
if len(users) > app.config['MENTION_LIMIT']:
|
if len(users) > app.config['MENTION_LIMIT']:
|
||||||
signal.alarm(0)
|
abort(400, f'Mentioned {len(users)} users but limit is {app.config["MENTION_LIMIT"]}')
|
||||||
abort(
|
|
||||||
make_response(
|
|
||||||
jsonify(
|
|
||||||
error=f'Mentioned {len(users)} users but limit is {app.config["MENTION_LIMIT"]}'), 400))
|
|
||||||
|
|
||||||
for u in users:
|
for u in users:
|
||||||
if not u: continue
|
if not u: continue
|
||||||
|
@ -281,12 +289,8 @@ def sanitize(sanitized, alert=False, comment=False, edit=False):
|
||||||
sanitized = sanitized.replace('&amp;','&')
|
sanitized = sanitized.replace('&amp;','&')
|
||||||
sanitized = utm_regex.sub('', sanitized)
|
sanitized = utm_regex.sub('', sanitized)
|
||||||
sanitized = utm_regex2.sub('', sanitized)
|
sanitized = utm_regex2.sub('', sanitized)
|
||||||
|
|
||||||
|
|
||||||
sanitized = sanitized.replace('<html><body>','').replace('</body></html>','')
|
sanitized = sanitized.replace('<html><body>','').replace('</body></html>','')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
sanitized = bleach.Cleaner(tags=allowed_tags,
|
sanitized = bleach.Cleaner(tags=allowed_tags,
|
||||||
attributes=allowed_attributes,
|
attributes=allowed_attributes,
|
||||||
protocols=['http', 'https'],
|
protocols=['http', 'https'],
|
||||||
|
@ -321,17 +325,11 @@ def sanitize(sanitized, alert=False, comment=False, edit=False):
|
||||||
domain_list.add(new_domain)
|
domain_list.add(new_domain)
|
||||||
|
|
||||||
bans = g.db.query(BannedDomain.domain).filter(BannedDomain.domain.in_(list(domain_list))).all()
|
bans = g.db.query(BannedDomain.domain).filter(BannedDomain.domain.in_(list(domain_list))).all()
|
||||||
|
|
||||||
if bans: abort(403, description=f"Remove the banned domains {bans} and try again!")
|
if bans: abort(403, description=f"Remove the banned domains {bans} and try again!")
|
||||||
|
|
||||||
return sanitized
|
return sanitized
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def allowed_attributes_emojis(tag, name, value):
|
def allowed_attributes_emojis(tag, name, value):
|
||||||
|
|
||||||
if tag == 'img':
|
if tag == 'img':
|
||||||
if name == 'loading' and value == 'lazy': return True
|
if name == 'loading' and value == 'lazy': return True
|
||||||
if name == 'data-bs-toggle' and value == 'tooltip': return True
|
if name == 'data-bs-toggle' and value == 'tooltip': return True
|
||||||
|
@ -339,9 +337,8 @@ def allowed_attributes_emojis(tag, name, value):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@with_sigalrm_timeout(1)
|
@with_gevent_timeout(1)
|
||||||
def filter_emojis_only(title, edit=False, graceful=False):
|
def filter_emojis_only(title, edit=False, graceful=False):
|
||||||
|
|
||||||
title = unwanted_bytes_regex.sub('', title)
|
title = unwanted_bytes_regex.sub('', title)
|
||||||
title = whitespace_regex.sub(' ', title)
|
title = whitespace_regex.sub(' ', title)
|
||||||
title = html.escape(title, quote=True)
|
title = html.escape(title, quote=True)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue