import pprint

import sqlalchemy
from sqlalchemy.orm import Session

from alive_progress import alive_it
from collections import defaultdict
from files.classes import User, Comment, UserNote, UserTag
from files.classes.cron.tasks import TaskRunContext
from files.classes.volunteer_janitor import VolunteerJanitorRecord, VolunteerJanitorResult
from files.helpers.volunteer_janitor import evaluate_badness_of, userweight_from_user_accuracy, calculate_final_comment_badness, update_comment_badness
from files.helpers.math import saturate, remap, lerp

import logging
import random

from files.__main__ import app, db_session

CONFIG_modhat_weight = 4
CONFIG_admin_volunteer_weight = 4
CONFIG_new_user_damping = 2
CONFIG_default_user_accuracy = 0.2
CONFIG_user_correctness_lerp = 0.2
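# How the knobs above are used below (informal interpretation, see the code):
# - CONFIG_modhat_weight: weight a mod verdict contributes to a comment's badness tally
# - CONFIG_admin_volunteer_weight: fixed weight an admin's own volunteer vote gets during training
# - CONFIG_new_user_damping: phantom weight pulling users with little history toward the default accuracy
# - CONFIG_default_user_accuracy: accuracy assumed for a user with no history at all
# - CONFIG_user_correctness_lerp: fraction of the way accuracy moves toward its new estimate each iteration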

def _compile_records(db):
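    """Collapse raw VolunteerJanitorRecord rows into one record per (user, comment) pair.

    Returns (records_compiled, users_compiled): records_compiled maps
    (user_id, comment_id) to a dict holding the user's vote status and final
    result, and users_compiled maps user_id to counts of each record status.
    """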
    vrecords = db.query(VolunteerJanitorRecord).order_by(VolunteerJanitorRecord.recorded_utc).all()

    # get the info we need for all mentioned posts
    reported_comment_ids = {record.comment_id for record in vrecords}
    reported_comments = db.query(Comment).where(Comment.id.in_(reported_comment_ids)).options(sqlalchemy.orm.load_only('id', 'state_user_deleted_utc'))
    reported_comments = {comment.id: comment for comment in reported_comments}

    # get our compiled data
    records_compiled = {}
    for record in vrecords:
        # we're just going to ignore deleted comments entirely
        if reported_comments[record.comment_id].state_user_deleted_utc is not None:
            continue

        # unique identifier for user/comment report pair
        uic = (record.user_id, record.comment_id)

        if record.result == VolunteerJanitorResult.Pending:
            if uic in records_compiled:
                # something wonky happened, we went back to pending after getting a result?
                records_compiled[uic]["status"] = "wonky"
            else:
                # fill out the pending field
                records_compiled[uic] = {"status": "pending"}
        else:
            if uic not in records_compiled:
                # something wonky happened, we never asked them for the info to begin with
                records_compiled[uic] = {"status": "wonky"}
            elif records_compiled[uic]["status"] != "pending":
                # received two submissions; practically we'll just use their first submission
                records_compiled[uic]["status"] = "resubmit"
            else:
                # actually got a result, yay
                records_compiled[uic]["status"] = "submit"
                records_compiled[uic]["result"] = record.result

    # todo:
    # filter out anything submitted *after* a mod chimed in
    # filter out anything submitted too long after the request

    users_compiled = defaultdict(lambda: {
        "pending": 0,
        "wonky": 0,
        "submit": 0,
        "resubmit": 0,
    })
    for key, result in records_compiled.items():
        #pprint.pprint(key)
        userid = key[0]

        users_compiled[userid][result["status"]] += 1

    #pprint.pprint(records_compiled)
    #pprint.pprint(users_compiled)

    # strip out invalid records
    random_removal = -1  # this is sometimes useful for testing that our algorithm is somewhat stable; removing a random half of all responses shouldn't drastically invert people's quality scores, for example
    records_compiled = {key: value for key, value in records_compiled.items() if "result" in value and random.random() > random_removal}

    return records_compiled, users_compiled

def dbg_commentdump(cid, records_compiled, users, user_accuracy):
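    """Print every volunteer vote on comment `cid`, along with each voter's accuracy."""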
print(f"Dump for comment {cid}")
|
|
|
|
from tabulate import tabulate
|
|
|
|
dats = []
|
|
for key, value in [(key, value) for key, value in records_compiled.items() if key[1] == cid]:
|
|
uid = key[0]
|
|
dats.append({
|
|
"vote": value["result"],
|
|
"username": users[uid]["username"],
|
|
"accuracy": user_accuracy[uid],
|
|
})
|
|
print(tabulate(dats, headers = "keys"))
|
|

def dbg_userdump(uid, records_compiled, users, comment_calculated_badness_user):
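    """Print every vote user `uid` made, alongside the consensus badness of each comment."""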
print(f"Dump for user {users[uid]['username']}")
|
|
|
|
from tabulate import tabulate
|
|
|
|
dats = []
|
|
for key, value in [(key, value) for key, value in records_compiled.items() if key[0] == uid]:
|
|
cid = key[1]
|
|
bad, weight = evaluate_badness_of(value["result"])
|
|
dats.append({
|
|
"cid": cid,
|
|
"vote": value["result"],
|
|
"calculated": evaluate_badness_of(value["result"]),
|
|
"badness": comment_calculated_badness_user[cid],
|
|
"correctness": evaluate_correctness_single(bad, weight, comment_calculated_badness_user[cid]),
|
|
})
|
|
print(tabulate(dats, headers = "keys"))
|
|
|

# Calculates how correct a user is, based on whether they thought it was bad, how confident they were, and how bad we think it is
# Returns (IsCorrect, Confidence)
def evaluate_correctness_single(bad, user_weight, calculated_badness):
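    """Score one user's vote against the consensus badness of a comment.

    Worked example (assuming remap(x, 0.1, 0.5, 0, 1) maps [0.1, 0.5]
    linearly onto [0, 1] and saturate clamps to [0, 1]): with bad=True,
    user_weight=1, and calculated_badness=0.9, the comment counts as bad
    (0.9 > 0.5), so the user was correct; our confidence is
    abs(0.9 - 0.5) = 0.4, which remaps to 0.75, so this vote is scored
    as (1, 0.75).
    """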
    # Boolean for whether this comment is bad
    calculated_badbool = calculated_badness > 0.5

    # Boolean for whether the user was correct
    correctness_result = 1 if bad == calculated_badbool else 0

    # "how confident are we that this is bad/notbad", range [0, 0.5]
    calculated_badness_confidence = abs(calculated_badness - 0.5)

    # "how much do we want this to influence the user's correctness"
    # there's a deadzone around not-confident where we just push it to 0 and don't make it relevant
    calculated_badness_weight = saturate(remap(calculated_badness_confidence, 0.1, 0.5, 0, 1))

    # see how correct we think the user is
    user_correctness = user_weight * calculated_badness_weight

    return correctness_result, user_correctness

def volunteer_janitor_recalc(db: Session, diagnostics: bool = False):
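    """Recompute all volunteer accuracy scores from the full janitor-vote history.

    Runs the comment-badness/user-accuracy fixed-point iteration below, then
    writes each user's final accuracy to User.volunteer_janitor_correctness.
    """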
    logging.info("Starting full janitor recalculation")

    # Get our full list of data
    records_compiled, users_compiled = _compile_records(db)

    reporting_user_ids = {record[0] for record in records_compiled}
    reported_comment_ids = {record[1] for record in records_compiled}

    # Get some metadata for all reported comments
    comments = db.query(Comment) \
        .where(Comment.id.in_(reported_comment_ids)) \
        .options(sqlalchemy.orm.load_only('id', 'created_utc', 'author_id'))
    comments = {comment.id: comment for comment in comments}

    reported_user_ids = {comment.author_id for comment in comments.values()}

    # Get mod intervention data
    modhats_raw = db.query(Comment) \
        .where(Comment.parent_comment_id.in_(reported_comment_ids)) \
        .where(Comment.distinguish_level > 0) \
        .options(sqlalchemy.orm.load_only('parent_comment_id', 'created_utc'))

    modhats = {}
    # we jump through some hoops to deduplicate this; I guess we just pick the last one in our list for now
    for modhat in modhats_raw:
        modhats[modhat.parent_comment_id] = modhat

    usernotes_raw = db.query(UserNote) \
        .where(UserNote.tag.in_([UserTag.Warning, UserTag.Tempban, UserTag.Permban, UserTag.Spam, UserTag.Bot])) \
        .options(sqlalchemy.orm.load_only('reference_user', 'created_datetimez', 'tag'))

    # Here we're trying to figure out whether modhats are actually warnings/bans
    # We don't have a formal connection between "a comment is bad" and "the user got a warning", so we're kind of awkwardly trying to derive it from our database
    # In addition, sometimes someone posts a lot of bad comments and only gets modhatted for one of them
    # That doesn't mean the other comments weren't bad
    # It just means we picked the worst one
    # So we ignore comments near the actual modhat time
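    # Concretely (summarizing the rules below): a comment whose modhat lands
    # within 15 minutes of a warning/ban-type usernote is "bad"; a comment
    # posted in the 48 hours before such a usernote is "ignored"; everything
    # else is "notbad".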

    commentresults = {}
    for uid in reported_user_ids:
        # For each user, figure out when modhats happened
        # this is slow but whatever
        modhat_times = []
        for modhat in modhats.values():
            if comments[modhat.parent_comment_id].author_id != uid:
                continue

            modhat_times.append(modhat.created_utc)

        usernote_times = []
        for usernote in usernotes_raw:
            if usernote.reference_user != uid:
                continue

            usernote_times.append(usernote.created_utc)

        # For each comment . . .
        for comment in comments.values():
            if comment.author_id != uid:
                continue

            if comment.id in modhats:
                modhat_comment = modhats[comment.id]
            else:
                modhat_comment = None

            # if the comment was modhatted *and* resulted in a negative usernote near the modhat time, it's bad
            if modhat_comment is not None and next((time for time in usernote_times if abs(modhat_comment.created_utc - time) < 60 * 15), None) is not None:
                commentresults[comment.id] = "bad"
            # otherwise, if the comment was posted less than 48 hours before a negative usernote, we ignore it for processing on the assumption that it may just have been part of a larger warning
            elif next((time for time in usernote_times if comment.created_utc < time and comment.created_utc + 48 * 60 * 60 > time), None) is not None:
                commentresults[comment.id] = "ignored"
            # otherwise, we call it not-bad
            else:
                commentresults[comment.id] = "notbad"

    # get per-user metadata
    users = db.query(User) \
        .where(User.id.in_(reporting_user_ids)) \
        .options(sqlalchemy.orm.load_only('id', 'username', 'admin_level'))
    users = {user.id: {"username": user.username, "admin": user.admin_level != 0} for user in users}

    user_accuracy = defaultdict(lambda: CONFIG_default_user_accuracy)

    # Do an update loop!
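    # This is a fixed-point iteration: each pass recomputes comment badness
    # from the current user accuracies, then recomputes user accuracies from
    # the resulting comment badness, lerping slowly so the scores settle
    # instead of oscillating.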
    for lid in range(0, 100):

        # Accumulated weight/badness, taking admin flags into account
        # This is used for training
        comment_weight_admin = defaultdict(lambda: 0)
        comment_badness_admin = defaultdict(lambda: 0)

        # Accumulated weight/badness, not taking admin flags into account
        # This is used for output and display
        comment_weight_user = defaultdict(lambda: 0)
        comment_badness_user = defaultdict(lambda: 0)

        # accumulate modhat weights
        for cid in reported_comment_ids:
            result = commentresults[cid]

            if result == "ignored":
                # I guess we'll just let the users decide?
                continue

            if result == "bad":
                comment_weight_admin[cid] += CONFIG_modhat_weight
                comment_badness_admin[cid] += CONFIG_modhat_weight

            if result == "notbad":
                comment_weight_admin[cid] += CONFIG_modhat_weight

        # accumulate volunteer weights
        for key, value in records_compiled.items():
            uid, cid = key

            # Calculate how much to weight a user; highly inaccurate users are not inverted! They just don't get contribution
            # (losers)
            userweight_user = userweight_from_user_accuracy(user_accuracy[uid])

            if users[uid]["admin"]:
                userweight_admin = CONFIG_admin_volunteer_weight
            else:
                userweight_admin = userweight_user

            bad, weight = evaluate_badness_of(value["result"])

            # Accumulate these to our buffers
            comment_weight_admin[cid] += userweight_admin * weight
            comment_weight_user[cid] += userweight_user * weight

            if bad:
                comment_badness_admin[cid] += userweight_admin * weight
                comment_badness_user[cid] += userweight_user * weight

        # Calculated badnesses, both taking admins into account and not doing so, and a "theoretical ideal" versus a conservative view designed to be more skeptical of low-weighted comments
        comment_calculated_badness_admin = {cid: calculate_final_comment_badness(comment_badness_admin[cid], comment_weight_admin[cid], False) for cid in reported_comment_ids}
        comment_calculated_badness_admin_conservative = {cid: calculate_final_comment_badness(comment_badness_admin[cid], comment_weight_admin[cid], True) for cid in reported_comment_ids}
        comment_calculated_badness_user = {cid: calculate_final_comment_badness(comment_badness_user[cid], comment_weight_user[cid], False) for cid in reported_comment_ids}
        comment_calculated_badness_user_conservative = {cid: calculate_final_comment_badness(comment_badness_user[cid], comment_weight_user[cid], True) for cid in reported_comment_ids}

        # go through user submissions and count up how good users seem to be at this
        user_correctness_weight = defaultdict(lambda: CONFIG_new_user_damping)
        # seeded so that value/weight starts out at the default accuracy
        user_correctness_value = defaultdict(lambda: CONFIG_default_user_accuracy * CONFIG_new_user_damping)

        for key, value in records_compiled.items():
            uid, cid = key

            # if this is "ignored", I don't trust that we have a real answer, so we just skip it for training purposes
            if commentresults[cid] == "ignored":
                continue

            bad, weight = evaluate_badness_of(value["result"])

            correctness, weight = evaluate_correctness_single(bad, weight, comment_calculated_badness_admin[cid])

            user_correctness_weight[uid] += weight
            user_correctness_value[uid] += correctness * weight

        # calculate new correctnesses
        for uid in reporting_user_ids:
            target_user_correctness = user_correctness_value[uid] / user_correctness_weight[uid]

            # lerp slowly to the new values
            user_accuracy[uid] = lerp(user_accuracy[uid], target_user_correctness, CONFIG_user_correctness_lerp)

    if diagnostics:
        # debug print

        from tabulate import tabulate

        commentscores = [{
            "link": f"https://themotte.org/comment/{cid}",
            "badness": comment_calculated_badness_admin[cid],
            "badnessuser": comment_calculated_badness_user[cid],
            "badnessusercons": comment_calculated_badness_user_conservative[cid],
            "participation": comment_weight_user[cid],
            "mh": commentresults[cid]} for cid in reported_comment_ids]
        commentscores.sort(key = lambda item: item["badnessusercons"] + item["badnessuser"] / 100)
        print(tabulate(commentscores, headers = "keys"))

        results = [{
            "user": f"https://themotte.org/@{users[uid]['username']}",
            "accuracy": user_accuracy[uid],
            "submit": users_compiled[uid]["submit"],
            "nonsubmit": sum(users_compiled[uid].values()) - users_compiled[uid]["submit"],
            "admin": "Admin" if users[uid]["admin"] else "",
        } for uid in reporting_user_ids]
        results.sort(key = lambda k: k["accuracy"])
        print(tabulate(results, headers = "keys"))

        dbg_commentdump(89681, records_compiled, users, user_accuracy)
        print(calculate_final_comment_badness(comment_badness_user[89681], comment_weight_user[89681], True))

        #dbg_userdump(131, records_compiled, users, comment_calculated_badness_user)

    # Shove all this in the database, yaaay
    # Conditional needed because sqlalchemy breaks if you try passing it zero data
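    # The case() below should compile to one "CASE users.id WHEN <id> THEN
    # <accuracy> ... END" expression, so every user's new accuracy lands in a
    # single UPDATE statement rather than one round-trip per user.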
    if len(user_accuracy) > 0:
        db.query(User) \
            .where(User.id.in_(list(user_accuracy.keys()))) \
            .update({
                User.volunteer_janitor_correctness: sqlalchemy.sql.case(
                    user_accuracy,
                    value = User.id,
                )
            })
        db.commit()

    # We don't bother recalculating comment confidences here; it's a pain to do it and they shouldn't change much
    logging.info("Finished full janitor recalculation")

@app.cli.command('volunteer_janitor_recalc')
def volunteer_janitor_recalc_cmd():
    volunteer_janitor_recalc(db_session(), diagnostics = True)

def volunteer_janitor_recalc_cron(ctx: TaskRunContext):
    volunteer_janitor_recalc(ctx.db)

def volunteer_janitor_recalc_all_comments(db: Session):
    # may as well do this first
    volunteer_janitor_recalc(db)

    # I'm not sure of the details here, but there seems to be some session-related caching cruft left around
    # so let's just nuke that
    db.expire_all()

    # going through all the comments piecemeal like this is hilariously inefficient, but this entire system gets run exactly once ever, so, okay
    for comment in alive_it(db.query(Comment).join(Comment.reports)):
        update_comment_badness(db, comment.id)

    db.commit()

@app.cli.command('volunteer_janitor_recalc_all_comments')
def volunteer_janitor_recalc_all_comments_cmd():
    volunteer_janitor_recalc_all_comments(db_session())