import re
import signal
import time

import bleach
import requests
from bleach.linkifier import LinkifyFilter, build_url_re
from bs4 import BeautifulSoup
from functools import partial
from json import loads, dump
from mistletoe import markdown
from os import path, environ
from random import random, choice
from urllib.parse import parse_qs, urlparse

# .get is expected to supply get_user plus the site-wide constants and
# precompiled regexes used below (SITE_FULL, emoji_regex*, youtube_regex, ...)
from .get import *
from .patter import pat

TLDS = ('ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar','arpa','as','asia','at','au','aw','ax','az','ba','bb','bd','be','bf','bg','bh','bi','biz','bj','bm','bn','bo','br','bs','bt','bv','bw','by','bz','ca','cafe','cat','cc','cd','cf','cg','ch','ci','ck','cl','club','cm','cn','co','com','coop','cr','cu','cv','cx','cy','cz','de','dj','dk','dm','do','dz','ec','edu','ee','eg','er','es','et','eu','fi','fj','fk','fm','fo','fr','ga','gb','gd','ge','gf','gg','gh','gi','gl','gm','gn','gov','gp','gq','gr','gs','gt','gu','gw','gy','hk','hm','hn','hr','ht','hu','id','ie','il','im','in','info','int','io','iq','ir','is','it','je','jm','jo','jobs','jp','ke','kg','kh','ki','km','kn','kp','kr','kw','ky','kz','la','lb','lc','li','lk','lr','ls','lt','lu','lv','ly','ma','mc','md','me','mg','mh','mil','mk','ml','mm','mn','mo','mobi','mp','mq','mr','ms','mt','mu','museum','mv','mw','mx','my','mz','na','name','nc','ne','net','nf','ng','ni','nl','no','np','nr','nu','nz','om','org','pa','pe','pf','pg','ph','pk','pl','pm','pn','post','pr','pro','ps','pt','pw','py','qa','re','ro','rs','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sj','sk','sl','sm','sn','so','social','sr','ss','st','su','sv','sx','sy','sz','tc','td','tel','tf','tg','th','tj','tk','tl','tm','tn','to','tp','tr','travel','tt','tv','tw','tz','ua','ug','uk','us','uy','uz','va','vc','ve','vg','vi','vn','vu','wf','win','ws','xn','xxx','ye','yt','yu','za','zm','zw')

allowed_tags = ('b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','marquee','a','span','ruby','rp','rt','spoiler','img','lite-youtube','video','source')
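# bleach accepts a callable for its `attributes` option: it is called as
# allowed_attributes(tag, name, value) for each attribute on a whitelisted tag
# and keeps the attribute only when the callable returns True, so everything
# below is a strict (tag, attribute, value) whitelist.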
def allowed_attributes(tag, name, value):

    if name == 'style': return True

    if tag == 'marquee':
        if name in ['direction', 'behavior', 'scrollamount']: return True
        if name in {'height', 'width'}:
            try: value = int(value.replace('px', ''))
            except: return False
            if 0 < value <= 250: return True
        return False

    if tag == 'a':
        if name == 'href': return True
        if name == 'rel' and value == 'nofollow noopener noreferrer': return True
        if name == 'target' and value == '_blank': return True
        return False

    if tag == 'img':
        if name in ['src', 'data-src']:
            if value.startswith('/') or value.startswith(f'{SITE_FULL}/') or embed_fullmatch_regex.fullmatch(value): return True
            else: return False
        if name == 'loading' and value == 'lazy': return True
        # must match the attribute name set on <img> tags in sanitize() below
        if name == 'referrerpolicy' and value == 'no-referrer': return True
        if name == 'data-bs-toggle' and value == 'tooltip': return True
        if name in ['alt', 'title', 'g', 'b', 'pat']: return True
        if name == 'class' and value == 'pat-hand': return True
        return False

    if tag == 'lite-youtube':
        if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True
        if name == 'videoid': return True
        return False

    if tag == 'video':
        if name == 'controls' and value == '': return True
        if name == 'preload' and value == 'none': return True
        return False

    if tag == 'source':
        if name == 'src' and embed_fullmatch_regex.fullmatch(value): return True
        return False

    if tag == 'p':
        if name == 'class' and value == 'mb-0': return True
        return False

    if tag == 'span':
        if name == 'class' and value in ['pat-container', 'pat-hand']: return True
        if name == 'data-bs-toggle' and value == 'tooltip': return True
        if name == 'title': return True
        if name == 'alt': return True
        return False

    return False


url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])


def callback(attrs, new=False):
    # linkifier callback: external links open in a new tab and carry
    # nofollow/noopener/noreferrer; site-relative links are left alone
    href = attrs[(None, "href")]
    if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
        attrs[(None, "target")] = "_blank"
        attrs[(None, "rel")] = "nofollow noopener noreferrer"
    return attrs


def handler(signum, frame):
    print("Timeout!")
    raise Exception("Timeout")


def render_emoji(html, regexp, edit, marseys_used=set(), b=False):
    emojis = list(regexp.finditer(html))
    captured = set()

    for i in emojis:
        if i.group(0) in captured: continue
        captured.add(i.group(0))

        emoji = i.group(1).lower()
        attrs = ''
        if b: attrs += ' b'
        # rare golden-marsey easter egg; never triggered on edits or emoji walls
        if not edit and len(emojis) <= 20 and random() < 0.0025 and ('marsey' in emoji or emoji in marseys_const2): attrs += ' g'

        old = emoji
        emoji = emoji.replace('!', '').replace('#', '')
        if emoji == 'marseyrandom': emoji = choice(marseys_const2)

        # {0} = emoji name, {1} = image source, {2} = extra bare attributes (b/g/pat)
        emoji_partial = '<img loading="lazy" data-bs-toggle="tooltip" alt=":{0}:" title=":{0}:" src="{1}"{2}>'
        emoji_html = None

        if emoji.endswith('pat'):
            if path.isfile(f"files/assets/images/emojis/{emoji.replace('pat','')}.webp"):
                attrs += ' pat'
                emoji_html = emoji_partial.format(old, f"/e/{emoji[:-3]}.webp", attrs)
            elif emoji.startswith('@'):
                if u := get_user(emoji[1:-3], graceful=True):
                    attrs += ' pat'
                    emoji_html = emoji_partial.format(old, f"/pp/{u.id}", attrs)
        elif path.isfile(f'files/assets/images/emojis/{emoji}.webp'):
            emoji_html = emoji_partial.format(old, f'/e/{emoji}.webp', attrs)

        if emoji_html:
            marseys_used.add(emoji)
            # negative lookbehind so :name: inside attribute values isn't re-replaced
            html = re.sub(f'(?<!"):{old}:', emoji_html, html)

    return html


def sanitize(sanitized, alert=False, comment=False, edit=False):
    """Render user-submitted markdown to HTML and scrub it down to the whitelists above."""
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(2)

    sanitized = markdown(sanitized)

    sanitized = strikethrough_regex.sub(r'<del>\1</del>', sanitized)

    # strip zero-width and other invisible characters
    sanitized = sanitized.replace('‎','').replace('​','').replace("\ufeff", "").replace("𒐪","")
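    # Mention handling: when alert is set, a matched @mention is replaced with
    # a link to the user's profile; otherwise reddit-style references are
    # linkified as well, and mentions of users with a block relationship to
    # the viewer are skipped unless the viewer's admin level is above 1.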

    if alert:
        captured = []
        for i in mention_regex2.finditer(sanitized):
            if i.group(0) in captured: continue
            captured.append(i.group(0))

            u = get_user(i.group(1), graceful=True)
            if u:
                sanitized = sanitized.replace(i.group(0), f'''<p><a href="/id/{u.id}">@{u.username}</a></p>''')
    else:
        sanitized = reddit_regex.sub(r'\1<a href="https://old.reddit.com/\2" rel="nofollow noopener noreferrer" target="_blank">/\2</a>', sanitized)
        sanitized = sub_regex.sub(r'\1<a href="/\2">/\2</a>', sanitized)

        captured = []
        for i in mention_regex.finditer(sanitized):
            if i.group(0) in captured: continue
            captured.append(i.group(0))

            u = get_user(i.group(2), graceful=True)
            if u and (not (g.v and g.v.any_block_exists(u)) or g.v.admin_level > 1):
                sanitized = sanitized.replace(i.group(0), f'''{i.group(1)}<a href="/id/{u.id}">@{u.username}</a>''')

    # request imgur's downsized webp rendition instead of the original
    sanitized = imgur_regex.sub(r'\1_d.webp?maxwidth=9999&fidelity=high', sanitized)

    soup = BeautifulSoup(sanitized, 'lxml')

    # lazy-load every non-avatar image behind a placeholder and hide the referrer
    for tag in soup.find_all("img"):
        if tag.get("src") and not tag["src"].startswith('/pp/'):
            tag["loading"] = "lazy"
            tag["data-src"] = tag["src"]
            tag["src"] = "/assets/images/loading.webp"
            tag['alt'] = f'![]({tag["data-src"]})'
            tag['referrerpolicy'] = "no-referrer"

    # if a link's visible text looks like a URL, show the real href to prevent spoofing
    for tag in soup.find_all("a"):
        if tag.get("href") and fishylinks_regex.fullmatch(str(tag.string)):
            tag.string = tag["href"]

    sanitized = str(soup)

    sanitized = spoiler_regex.sub(r'<spoiler>\1</spoiler>', sanitized)

    marseys_used = set()

    emojis = list(emoji_regex.finditer(sanitized))
    if len(emojis) > 20: edit = True

    captured = []
    for i in emojis:
        if i.group(0) in captured: continue
        captured.append(i.group(0))

        old = i.group(0)
        if 'marseylong1' in old or 'marseylong2' in old or 'marseyllama1' in old or 'marseyllama2' in old:
            new = old.lower().replace(">", " class='mb-0'>")
        else: new = old.lower()

        new = render_emoji(new, emoji_regex2, edit, marseys_used, True)

        sanitized = sanitized.replace(old, new)

    emojis = list(emoji_regex3.finditer(sanitized))
    if len(emojis) > 20: edit = True

    sanitized = render_emoji(sanitized, emoji_regex3, edit, marseys_used)

    # normalize reddit mirrors and mobile/www hosts to canonical forms
    for rd in ["://reddit.com", "://new.reddit.com", "://www.reddit.com", "://redd.it", "://libredd.it", "://teddit.net"]:
        sanitized = sanitized.replace(rd, "://old.reddit.com")
    sanitized = (sanitized
        .replace("nitter.net", "twitter.com")
        .replace("old.reddit.com/gallery", "reddit.com/gallery")
        .replace("https://youtu.be/", "https://youtube.com/watch?v=")
        .replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=")
        .replace("https://streamable.com/", "https://streamable.com/e/")
        .replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=")
        .replace("https://mobile.twitter", "https://twitter")
        .replace("https://m.facebook", "https://facebook")
        .replace("m.wikipedia.org", "wikipedia.org")
        .replace("https://m.youtube", "https://youtube")
        .replace("https://www.youtube", "https://youtube")
        .replace("https://www.twitter", "https://twitter")
        .replace("https://www.instagram", "https://instagram")
        .replace("https://www.tiktok", "https://tiktok"))

    if "https://youtube.com/watch?v=" in sanitized: sanitized = sanitized.replace("?t=", "&t=")

    captured = []
    for i in youtube_regex.finditer(sanitized):
        if i.group(0) in captured: continue
        captured.append(i.group(0))

        params = parse_qs(urlparse(i.group(2).replace('&amp;', '&')).query)
        t = params.get('t', params.get('start', [0]))[0]
        if isinstance(t, str): t = t.replace('s', '')

        # group(3) is the bare video id captured by youtube_regex
        htmlsource = f'{i.group(1)}<lite-youtube videoid="{i.group(3)}" params="autoplay=1&modestbranding=1'
        if t: htmlsource += f'&start={t}'
        htmlsource += '"></lite-youtube>'

        sanitized = sanitized.replace(i.group(0), htmlsource)

    sanitized = video_sub_regex.sub(r'<p><video controls preload="none"><source src="\1"></video></p>', sanitized)

    if comment:
        # bump usage counters for every marsey actually rendered in this comment
        for marsey in g.db.query(Marsey).filter(Marsey.name.in_(marseys_used)).all():
            marsey.count += 1
            g.db.add(marsey)

    if '#fortune' in sanitized:
        sanitized = sanitized.replace('#fortune', '')
        sanitized += '\n\n<p>' + choice(FORTUNE_REPLIES) + '</p>'
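    # Final pass: unescape ampersands so the utm regexes can match query
    # strings, strip tracking parameters, drop the <html><body> wrapper that
    # lxml adds, then bleach everything against the whitelists defined above.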
    sanitized = sanitized.replace('&amp;', '&')

    sanitized = utm_regex.sub('', sanitized)
    sanitized = utm_regex2.sub('', sanitized)

    sanitized = sanitized.replace('<html><body>', '').replace('</body></html>', '')

    sanitized = bleach.Cleaner(
        tags=allowed_tags,
        attributes=allowed_attributes,
        protocols=['http', 'https'],
        styles=['color', 'background-color', 'font-weight', 'text-align'],
        filters=[partial(LinkifyFilter, skip_tags=["pre"], parse_email=False, callbacks=[callback], url_re=url_re)],
    ).clean(sanitized)

    # collect every linked domain+path and every parent domain,
    # then reject the whole submission if any of them is banned
    soup = BeautifulSoup(sanitized, 'lxml')

    links = soup.find_all("a")

    domain_list = set()

    for link in links:
        href = link.get("href")
        if not href: continue

        url = urlparse(href)
        domain = url.netloc
        url_path = url.path
        domain_list.add(domain + url_path)

        parts = domain.split(".")
        for i in range(len(parts)):
            new_domain = parts[i]
            for j in range(i + 1, len(parts)):
                new_domain += "." + parts[j]
            domain_list.add(new_domain)

    bans = g.db.query(BannedDomain.domain).filter(BannedDomain.domain.in_(list(domain_list))).all()

    if bans: abort(403, description=f"Remove the banned domains {bans} and try again!")

    signal.alarm(0)

    return sanitized


def allowed_attributes_emojis(tag, name, value):
    if tag == 'img':
        if name == 'loading' and value == 'lazy': return True
        if name == 'data-bs-toggle' and value == 'tooltip': return True
        if name in ['src', 'alt', 'title', 'g']: return True
    return False


def filter_emojis_only(title, edit=False, graceful=False):
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(1)

    # strip invisible characters and whitespace, then HTML-escape everything
    title = title.replace('‎','').replace('​','').replace("\ufeff", "").replace("𒐪","").replace("\n", "").replace("\r", "").replace("\t", "").replace("&", "&amp;").replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", "&#039;").strip()

    title = render_emoji(title, emoji_regex4, edit)

    title = strikethrough_regex.sub(r'<del>\1</del>', title)

    # keep only emoji <img> and <del> tags; assign back so the cleaned value is returned
    title = bleach.clean(title, tags=['img', 'del'], attributes=allowed_attributes_emojis, protocols=['http', 'https'])

    signal.alarm(0)

    if len(title) > 1500 and not graceful: abort(400)
    else: return title
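# Usage sketch (hypothetical call sites; both functions expect a Flask request
# context, since sanitize() reaches into g.db/g.v and may abort()):
#
#   body_html = sanitize(request.values.get("body", ""), comment=True)
#   title_html = filter_emojis_only(request.values.get("title", ""))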