Optimize comment pagination in post_id, viewmore

Against a clean seeded DB, reduces `GET /post/1/` from 63 queries to 26 by removing redundancies and slow lazy-loaded queries during top comment pagination. Also applies eager loading to /viewmore/ with the expected reduction from 5*(N comments) queries to ~12/request. For testing locally, use a newly seeded DB to ensure Comment.descendant_count is populated. Ref: #485
2023-02-07 14:05:16 -05:00 · 2023-02-07 14:05:16 -05:00 · 007f0a3f02
commit 007f0a3f02
parent 946ee6291d
4 changed files with 94 additions and 172 deletions
--- a/files/commands/seed_db.py
+++ b/files/commands/seed_db.py
@ -4,6 +4,7 @@ import sqlalchemy
 from werkzeug.security import generate_password_hash
 from files.__main__ import app
 from files.classes import User, Submission, Comment, Vote, CommentVote
+from files.helpers.comments import bulk_recompute_descendant_counts
 from flask_sqlalchemy import SQLAlchemy

 db = SQLAlchemy(app)
@ -262,5 +263,8 @@ def seed_db():
 		comment.realupvotes = comment.upvotes - comment.downvotes
 		db.session.add(comment)

+	print("Computing comment descendant_count")
+	bulk_recompute_descendant_counts(db=db.session)
+
 	db.session.commit()
 	db.session.flush()
--- a/files/helpers/comments.py
+++ b/files/helpers/comments.py
@ -1,15 +1,15 @@
 from pusher_push_notifications import PushNotifications
-from files.classes import Comment, Notification, Subscription
+from files.classes import Comment, Notification, Subscription, User
 from files.helpers.alerts import NOTIFY_USERS
 from files.helpers.const import PUSHER_ID, PUSHER_KEY, SITE_ID, SITE_FULL
 from files.helpers.assetcache import assetcache_path
 from flask import g
 from sqlalchemy import select, update
 from sqlalchemy.sql.expression import func, text, alias
-from sqlalchemy.orm import aliased
+from sqlalchemy.orm import Query, aliased
 from sys import stdout
 import gevent
-import typing
+from typing import Optional

 if PUSHER_ID != 'blahblahblah':
 	beams_client = PushNotifications(instance_id=PUSHER_ID, secret_key=PUSHER_KEY)
@ -217,3 +217,16 @@ def comment_on_unpublish(comment:Comment):
 	reflect the comments users will actually see.
 	"""
 	update_stateful_counters(comment, -1)
+
+
+def comment_filter_moderated(q: Query, v: Optional[User]) -> Query:
+	if not (v and v.shadowbanned) and not (v and v.admin_level > 2):
+		q = q.join(User, User.id == Comment.author_id) \
+		     .filter(User.shadowbanned == None)
+	if not v or v.admin_level < 2:
+		q = q.filter(
+			((Comment.filter_state != 'filtered')
+				& (Comment.filter_state != 'removed'))
+			| (Comment.author_id == ((v and v.id) or 0))
+		)
+	return q
--- a/files/helpers/get.py
+++ b/files/helpers/get.py
@ -1,9 +1,9 @@
 from collections import defaultdict
-from typing import Iterable, List, Optional, Type, Union
+from typing import Callable, Iterable, List, Optional, Type, Union

 from flask import g
 from sqlalchemy import and_, or_, func
-from sqlalchemy.orm import selectinload
+from sqlalchemy.orm import Query, selectinload

 from files.classes import *
 from files.helpers.const import AUTOJANNY_ID
@ -277,7 +277,7 @@ def get_comments(
 # TODO: There is probably some way to unify this with get_comments. However, in
 #       the interim, it's a hot path and benefits from having tailored code.
 def get_comment_trees_eager(
-		top_comment_ids:Iterable[int],
+		query_filter_callable: Callable[[Query], Query],
 		sort: str="old",
 		v: Optional[User]=None) -> List[Comment]:

@ -305,7 +305,7 @@ def get_comment_trees_eager(
 	else:
 		query = g.db.query(Comment)

-	query = query.filter(Comment.top_comment_id.in_(top_comment_ids))
+	query = query_filter_callable(query)
 	query = query.options(
 		selectinload(Comment.author).options(
 			selectinload(User.badges),
@ -335,13 +335,12 @@ def get_comment_trees_eager(
 		comments_map_parent[c.parent_comment_id].append(c)

 	for parent_id in comments_map_parent:
-		if parent_id is None: continue
-
 		comments_map_parent[parent_id] = sort_comment_results(
 			comments_map_parent[parent_id], sort)
+		if parent_id in comments_map:
 			comments_map[parent_id].replies2 = comments_map_parent[parent_id]

-	return [comments_map[tcid] for tcid in top_comment_ids]
+	return comments, comments_map_parent


 # TODO: This function was concisely inlined into posts.py in upstream.
--- a/files/routes/posts.py
+++ b/files/routes/posts.py
@ -3,6 +3,7 @@ import gevent
 from files.helpers.wrappers import *
 from files.helpers.sanitize import *
 from files.helpers.alerts import *
+from files.helpers.comments import comment_filter_moderated
 from files.helpers.contentsorting import sort_objects
 from files.helpers.const import *
 from files.helpers.strings import sql_ilike_clean
@ -17,6 +18,7 @@ from os import path
 import requests
 from shutil import copyfile
 from sys import stdout
+from sqlalchemy.orm import Query


 snappyquotes = [f':#{x}:' for x in marseys_const2]
@ -139,93 +141,33 @@ def post_id(pid, anything=None, v=None, sub=None):

 	if post.club and not (v and (v.paid_dues or v.id == post.author_id)): abort(403)

-	if v:
-		votes = g.db.query(CommentVote).filter_by(user_id=v.id).subquery()
-
-		blocking = v.blocking.subquery()
-
-		blocked = v.blocked.subquery()
-
-		comments = g.db.query(
-			Comment,
-			votes.c.vote_type,
-			blocking.c.target_id,
-			blocked.c.target_id,
-		)
-		
-		if not (v and v.shadowbanned) and not (v and v.admin_level > 2):
-			comments = comments.join(User, User.id == Comment.author_id).filter(User.shadowbanned == None)
-
-		if v.admin_level < 2:
-			filter_clause = ((Comment.filter_state != 'filtered') & (Comment.filter_state != 'removed')) | (Comment.author_id == v.id)
-			comments = comments.filter(filter_clause)
-
-		comments=comments.filter(Comment.parent_submission == post.id).join(
-			votes,
-			votes.c.comment_id == Comment.id,
-			isouter=True
-		).join(
-			blocking,
-			blocking.c.target_id == Comment.author_id,
-			isouter=True
-		).join(
-			blocked,
-			blocked.c.user_id == Comment.author_id,
-			isouter=True
-		)
-
-		output = []
-		for c in comments.all():
-			comment = c[0]
-			comment.voted = c[1] or 0
-			comment.is_blocking = c[2] or 0
-			comment.is_blocked = c[3] or 0
-			output.append(comment)
-		
-		pinned = [c[0] for c in comments.filter(Comment.is_pinned != None).all()]
-		
-		comments = comments.filter(Comment.level == 1, Comment.is_pinned == None)
-		comments = sort_objects(comments, sort, Comment)
-		comments = [c[0] for c in comments.all()]
-	else:
-		pinned = g.db.query(Comment).filter(Comment.parent_submission == post.id, Comment.is_pinned != None).all()
-
-		comments = g.db.query(Comment).join(User, User.id == Comment.author_id).filter(User.shadowbanned == None, Comment.parent_submission == post.id, Comment.level == 1, Comment.is_pinned == None)
-		comments = sort_objects(comments, sort, Comment)
-
-		filter_clause = (Comment.filter_state != 'filtered') & (Comment.filter_state != 'removed')
-		comments = comments.filter(filter_clause)
-
-		comments = comments.all()
-
-	offset = 0
-	ids = set()
-
 	limit = app.config['RESULTS_PER_PAGE_COMMENTS']
+	offset = 0

-	if post.comment_count > limit and not request.headers.get("Authorization") and not request.values.get("all"):
-		comments2 = []
-		count = 0
-		for comment in comments:
-			comments2.append(comment)
-			ids.add(comment.id)
-			count += g.db.query(Comment.id).filter_by(parent_submission=post.id, top_comment_id=comment.id).count() + 1
-			if count > limit: break
+	top_comments = g.db.query(Comment.id, Comment.descendant_count).filter(
+		Comment.parent_submission == post.id,
+		Comment.level == 1,
+	).order_by(Comment.is_pinned.desc().nulls_last())
+	top_comments = comment_filter_moderated(top_comments, v)
+	top_comments = sort_objects(top_comments, sort, Comment)

-		if len(comments) == len(comments2): offset = 0
-		else: offset = 1
-		comments = comments2
+	pg_top_comment_ids = []
+	pg_comment_qty = 0
+	for tc_id, tc_children_qty in top_comments.all():
+		if pg_comment_qty >= limit:
+			offset = 1
+			break
+		pg_comment_qty += tc_children_qty + 1
+		pg_top_comment_ids.append(tc_id)

-	for pin in pinned:
-		if pin.is_pinned_utc and int(time.time()) > pin.is_pinned_utc:
-			pin.is_pinned = None
-			pin.is_pinned_utc = None
-			g.db.add(pin)
-			pinned.remove(pin)
+	def comment_tree_filter(q: Query) -> Query:
+		q = q.filter(Comment.top_comment_id.in_(pg_top_comment_ids))
+		q = comment_filter_moderated(q, v)
+		return q

-	top_comments = pinned + comments
-	top_comment_ids = [c.id for c in top_comments]
-	post.replies = get_comment_trees_eager(top_comment_ids, sort, v)
+	comments, comment_tree = get_comment_trees_eager(comment_tree_filter, sort, v)
+	post.replies = comment_tree[None] # parent=None -> top-level comments
+	ids = {c.id for c in post.replies}

 	post.views += 1
 	g.db.expire_on_commit = False
@ -246,88 +188,52 @@ def viewmore(v, pid, sort, offset):
 	post = get_post(pid, v=v)
 	if post.club and not (v and (v.paid_dues or v.id == post.author_id)): abort(403)

-	offset = int(offset)
+	offset_prev = int(offset)
 	try: ids = set(int(x) for x in request.values.get("ids").split(','))
 	except: abort(400)
 	
-	if sort == "new":
-		newest = g.db.query(Comment).filter(Comment.id.in_(ids)).order_by(Comment.created_utc.desc()).first()
-
-	if v:
-		votes = g.db.query(CommentVote).filter_by(user_id=v.id).subquery()
-
-		blocking = v.blocking.subquery()
-
-		blocked = v.blocked.subquery()
-
-		comments = g.db.query(
-			Comment,
-			votes.c.vote_type,
-			blocking.c.target_id,
-			blocked.c.target_id,
-		).filter(Comment.parent_submission == pid, Comment.is_pinned == None, Comment.id.notin_(ids))
-
-		if not (v and v.shadowbanned) and not (v and v.admin_level > 2):
-			comments = comments.join(User, User.id == Comment.author_id).filter(User.shadowbanned == None)
-
-		if not v or v.admin_level < 2:
-			filter_clause = (Comment.filter_state != 'filtered') & (Comment.filter_state != 'removed')
-			if v:
-				filter_clause = filter_clause | (Comment.author_id == v.id)
-			comments = comments.filter(filter_clause)
-
-		comments=comments.join(
-			votes,
-			votes.c.comment_id == Comment.id,
-			isouter=True
-		).join(
-			blocking,
-			blocking.c.target_id == Comment.author_id,
-			isouter=True
-		).join(
-			blocked,
-			blocked.c.user_id == Comment.author_id,
-			isouter=True
-		)
-
-		output = []
-		for c in comments.all():
-			comment = c[0]
-			comment.voted = c[1] or 0
-			comment.is_blocking = c[2] or 0
-			comment.is_blocked = c[3] or 0
-			output.append(comment)
-		
-		comments = comments.filter(Comment.level == 1)
-
-		if sort == "new":
-			comments = comments.filter(Comment.created_utc < newest.created_utc)
-		comments = sort_objects(comments, sort, Comment)
-
-		comments = [c[0] for c in comments.all()]
-	else:
-		comments = g.db.query(Comment).join(User, User.id == Comment.author_id).filter(User.shadowbanned == None, Comment.parent_submission == pid, Comment.level == 1, Comment.is_pinned == None, Comment.id.notin_(ids))
-
-		if sort == "new":
-			comments = comments.filter(Comment.created_utc < newest.created_utc)
-		comments = sort_objects(comments, sort, Comment)
-
-		comments = comments.all()
-		comments = comments[offset:]
-
 	limit = app.config['RESULTS_PER_PAGE_COMMENTS']
-	comments2 = []
-	count = 0
+	offset = 0

-	for comment in comments:
-		comments2.append(comment)
-		ids.add(comment.id)
-		count += g.db.query(Comment.id).filter_by(parent_submission=post.id, top_comment_id=comment.id).count() + 1
-		if count > limit: break
+	# TODO: Unify with common post_id logic
+	top_comments = g.db.query(Comment.id, Comment.descendant_count).filter(
+		Comment.parent_submission == post.id,
+		Comment.level == 1,
+		Comment.id.notin_(ids),
+		Comment.is_pinned == None,
+	).order_by(Comment.is_pinned.desc().nulls_last())

-	if len(comments) == len(comments2): offset = 0
-	else: offset += 1
-	comments = comments2
+	if sort == "new":
+		newest_created_utc = g.db.query(Comment.created_utc).filter(
+			Comment.id.in_(ids),
+			Comment.is_pinned == None,
+		).order_by(Comment.created_utc.desc()).limit(1).scalar()
+
+		# Needs to be <=, not just <, to support seed_db data which has many identical
+		# created_utc values. Shouldn't cause duplication in real data because of the
+		# `NOT IN :ids` in top_comments.
+		top_comments = top_comments.filter(Comment.created_utc <= newest_created_utc)
+
+	top_comments = comment_filter_moderated(top_comments, v)
+	top_comments = sort_objects(top_comments, sort, Comment)
+
+	pg_top_comment_ids = []
+	pg_comment_qty = 0
+	for tc_id, tc_children_qty in top_comments.all():
+		if pg_comment_qty >= limit:
+			offset = offset_prev + 1
+			break
+		pg_comment_qty += tc_children_qty + 1
+		pg_top_comment_ids.append(tc_id)
+
+	def comment_tree_filter(q: Query) -> Query:
+		q = q.filter(Comment.top_comment_id.in_(pg_top_comment_ids))
+		q = comment_filter_moderated(q, v)
+		return q
+
+	_, comment_tree = get_comment_trees_eager(comment_tree_filter, sort, v)
+	comments = comment_tree[None] # parent=None -> top-level comments
+	ids |= {c.id for c in comments}

 	return render_template("comments.html", v=v, comments=comments, p=post, ids=list(ids), render_replies=True, pid=pid, sort=sort, offset=offset, ajax=True)