From 1b9f7860c5618fa0a5bed618eb3a5da4a7cedc56 Mon Sep 17 00:00:00 2001 From: Jack Byrne Date: Sat, 10 Sep 2022 18:56:13 +0100 Subject: [PATCH] nicer and more efficient sanitisation --- files/classes/submission.py | 4 ++-- files/helpers/const.py | 7 +++++++ files/helpers/sanitize.py | 7 +++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/files/classes/submission.py b/files/classes/submission.py index a78ca04b9..c4fddcead 100644 --- a/files/classes/submission.py +++ b/files/classes/submission.py @@ -396,9 +396,9 @@ class Submission(Base): @lazy def realtitle(self, v): if self.title_html: - return self.title_html + return self.title_html else: - return self.title + return self.title @lazy def plaintitle(self, v): diff --git a/files/helpers/const.py b/files/helpers/const.py index a304e74c4..2a14cbf8c 100644 --- a/files/helpers/const.py +++ b/files/helpers/const.py @@ -478,6 +478,13 @@ spoiler_regex = re.compile('''\|\|(.+)\|\|''', flags=re.A) reddit_regex = re.compile('(^|\s|<p>)\/?((r|u)\/(\w|-){3,25})(?![^<]*<\/(code|pre|a)>)', flags=re.A) sub_regex = re.compile('(^|\s|<p>)\/?(h\/(\w|-){3,25})', flags=re.A) +# Bytes that shouldn't be allowed in user-submitted text +# U+200E is LTR toggle, U+200F is RTL toggle, U+200B and U+FEFF are Zero-Width Spaces, +# and U+1242A is a massive and terrifying cuneiform numeral +unwanted_bytes_regex = re.compile("\u200e|\u200f|\u200b|\ufeff|\U0001242a") + +whitespace_regex = re.compile('\s+') + strikethrough_regex = re.compile('''~{1,2}([^~]+)~{1,2}''', flags=re.A) mute_regex = re.compile("/mute @([a-z0-9_\-]{3,25}) ([0-9])+", flags=re.A) diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py index ce8b15061..c168747aa 100644 --- a/files/helpers/sanitize.py +++ b/files/helpers/sanitize.py @@ -1,4 +1,5 @@ import functools +import html import bleach from bs4 import BeautifulSoup from bleach.linkifier import LinkifyFilter, build_url_re @@ -166,7 +167,7 @@ def sanitize(sanitized, alert=False, comment=False, edit=False): sanitized = strikethrough_regex.sub(r'\1', sanitized) # remove left-to-right mark; remove zero width space; remove zero width no-break space; remove Cuneiform Numeric Sign Eight; - sanitized = sanitized.replace('‎','').replace('​','').replace("\ufeff", "").replace("𒐪","") + sanitized = unwanted_bytes_regex.sub('', sanitized) if alert: matches = { g.group(1):g for g in mention_regex2.finditer(sanitized) if g } @@ -340,7 +341,9 @@ def allowed_attributes_emojis(tag, name, value): @with_sigalrm_timeout(1) def filter_emojis_only(title, edit=False, graceful=False): - title = title.replace('‎','').replace('​','').replace("\ufeff", "").replace("𒐪","").replace("\n", "").replace("\r", "").replace("\t", "").replace("&", "&amp;").replace('<','&lt;').replace('>','&gt;').replace('"', '&quot;').replace("'", "&#039;").strip() + title = unwanted_bytes_regex.sub('', title) + title = whitespace_regex.sub(' ', title) + title = html.escape(title, quote=True) # title = render_emoji(title, emoji_regex3, edit)