diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py index a19637df0..65be0a660 100644 --- a/files/helpers/sanitize.py +++ b/files/helpers/sanitize.py @@ -10,8 +10,6 @@ from json import loads, dump from random import random, choice import signal import time -from urllib.parse import ParseResult, urlunparse, urlparse - allowed_tags = tags = ['b', 'blockquote', @@ -117,13 +115,7 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): sanitized = strikethrough_regex.sub(r'\1', sanitized) - sanitized = sanitized.replace("\ufeff", "").replace("𒐪","").replace("","").replace('‎','').replace("https://youtu.be/", "https://youtube.com/watch?v=").replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=").replace("https://streamable.com/", "https://streamable.com/e/").replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=").replace("https://mobile.twitter", "https://twitter").replace("https://m.facebook", "https://facebook").replace("m.wikipedia.org", "wikipedia.org").replace("https://m.youtube", "https://youtube").replace("https://www.youtube", "https://youtube").replace("old.reddit.com/gallery", "reddit.com/gallery") - - if "https://youtube.com/watch?v=" in sanitized: sanitized = sanitized.replace("?t=", "&t=") - - for rd in ["://reddit.com", "://new.reddit.com", "://www.reddit.com", "://redd.it", "://libredd.it"]: - sanitized = sanitized.replace(rd, "://old.reddit.com") - + sanitized = sanitized.replace("\ufeff", "").replace("𒐪","").replace("","").replace('‎','') if alert: captured = [] @@ -203,32 +195,6 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): except: tag.string = "" - parsed_url = urlparse(tag.get("href")) - - domain = parsed_url.netloc - if domain == 'old.reddit.com': - new_url = ParseResult(scheme="https", - netloc=parsed_url.netloc, - path=parsed_url.path, - params=parsed_url.params, - query=None, - fragment=parsed_url.fragment) - else: - qd = parse_qs(parsed_url.query) - filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')} - - new_url = ParseResult(scheme="https", - netloc=parsed_url.netloc, - path=parsed_url.path, - params=parsed_url.params, - query=urlencode(filtered, doseq=True), - fragment=parsed_url.fragment) - - - new_url = urlunparse(new_url) - if tag.string == tag["href"]: tag.string = new_url - tag["href"] = new_url - sanitized = str(soup) @@ -310,6 +276,10 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): sanitized = re.sub(f'(?', sanitized, flags=re.I|re.A) if comment: marseys_used.add(emoji) + sanitized = sanitized.replace("https://youtu.be/", "https://youtube.com/watch?v=").replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=").replace("https://streamable.com/", "https://streamable.com/e/").replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=").replace("https://mobile.twitter", "https://twitter").replace("https://m.facebook", "https://facebook").replace("m.wikipedia.org", "wikipedia.org").replace("https://m.youtube", "https://youtube").replace("https://www.youtube", "https://youtube") + + if "https://youtube.com/watch?v=" in sanitized: sanitized = sanitized.replace("?t=", "&t=") + captured = [] for i in youtube_regex.finditer(sanitized): if i.group(0) in captured: continue @@ -329,6 +299,11 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): sanitized = sanitized.replace(replacing, htmlsource) + for rd in ["://reddit.com", "://new.reddit.com", "://www.reddit.com", "://redd.it", "://libredd.it"]: + sanitized = sanitized.replace(rd, "://old.reddit.com") + + sanitized = sanitized.replace("old.reddit.com/gallery", "reddit.com/gallery") + sanitized = unlinked_regex.sub(r'\1\2', sanitized)