#!/usr/lib64/linuxfabrik-monitoring-plugins/venv/bin/python
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import re
import sys

import lib.args
import lib.base
import lib.db_mysql
import lib.txt
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN

__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2026051204'

DESCRIPTION = """Checks for in-flight MySQL/MariaDB queries that have been running longer
than `--warning` / `--critical` seconds. Unlike `mysql-slow-queries` (which trends the
historical ratio of finished slow queries), this plugin shows queries that are *currently*
executing right now, with their session ID, user, database, runtime and a truncated copy
of the statement so the admin can `KILL <id>` directly. Sleeping sessions and replication
threads are ignored. Logic taken from MySQLTuner `mysql_pfs()`, but the data
source is `information_schema.processlist` so the check works on every MySQL/MariaDB
release without requiring Performance Schema to be enabled. Without `PROCESS` privilege,
the monitoring user only sees its own sessions; grant `PROCESS` on `*.*` to see queries
across all sessions."""

DEFAULT_CRIT = '300'
DEFAULT_DEFAULTS_FILE = '/var/spool/icinga2/.my.cnf'
DEFAULT_DEFAULTS_GROUP = 'client'
DEFAULT_LENGTHY = False
DEFAULT_TIMEOUT = 3
DEFAULT_WARN = '30'

# information_schema.processlist.INFO has no documented length cap (the
# server returns the full statement). For a monitoring output we truncate
# to keep one finding line readable; --lengthy switches to the full
# statement when an admin is investigating in interactive mode.
STATEMENT_PREVIEW_CHARS = 120

# Replication / admin housekeeping commands that we never want to count as
# "long-running query" even when they sit for hours. They are part of the
# server's normal background activity.
IGNORED_COMMANDS = ('Sleep', 'Binlog Dump', 'Daemon')


def parse_args():
    """Parse command line arguments using argparse."""
    parser = argparse.ArgumentParser(description=DESCRIPTION)

    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f'%(prog)s: v{__version__} by {__author__}',
    )

    parser.add_argument(
        '--always-ok',
        help=lib.args.help('--always-ok'),
        dest='ALWAYS_OK',
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '-c',
        '--critical',
        help='Query runtime in seconds that triggers CRIT. '
        'Supports Nagios ranges. '
        'Example: `--critical=60`. '
        'Default: %(default)s',
        dest='CRIT',
        default=DEFAULT_CRIT,
    )

    parser.add_argument(
        '--defaults-file',
        help='MySQL/MariaDB cnf file to read user, host and password from. '
        'Example: `--defaults-file=/var/spool/icinga2/.my.cnf`. '
        'Default: %(default)s',
        dest='DEFAULTS_FILE',
        default=DEFAULT_DEFAULTS_FILE,
    )

    parser.add_argument(
        '--defaults-group',
        help=lib.args.help('--defaults-group') + ' Default: %(default)s',
        dest='DEFAULTS_GROUP',
        default=DEFAULT_DEFAULTS_GROUP,
    )

    parser.add_argument(
        '--ignore',
        help='Regex applied to the running statement. Matching queries are '
        'ignored (e.g. `--ignore=^mysqldump` to exclude backup runs). '
        'Repeat the flag for multiple patterns. '
        'Example: `--ignore=ANALYZE --ignore=^mysqldump`',
        dest='IGNORE',
        action='append',
        default=None,
    )

    parser.add_argument(
        '--lengthy',
        help=lib.args.help('--lengthy'),
        dest='LENGTHY',
        action='store_true',
        default=DEFAULT_LENGTHY,
    )

    parser.add_argument(
        '--timeout',
        help=lib.args.help('--timeout') + ' Default: %(default)s (seconds)',
        dest='TIMEOUT',
        type=int,
        default=DEFAULT_TIMEOUT,
    )

    parser.add_argument(
        '-w',
        '--warning',
        help='Query runtime in seconds that triggers WARN. '
        'Supports Nagios ranges. '
        'Example: `--warning=15`. '
        'Default: %(default)s',
        dest='WARN',
        default=DEFAULT_WARN,
    )

    args, _ = parser.parse_known_args()
    return args


def fetch_long_queries(conn):
    """Return all in-flight non-sleeping queries from
    `information_schema.processlist`, excluding the plugin's own
    session (otherwise the `SELECT FROM information_schema.processlist`
    that fetches this list would show up as an in-flight query in its
    own output). Sleeping sessions and replication threads (`Binlog
    Dump`, `Daemon`) are filtered server-side to keep the network
    payload small even on busy servers. Returns a list of dicts; empty
    list when nothing matches.
    """
    placeholders = ', '.join(f"'{c}'" for c in IGNORED_COMMANDS)
    sql = f"""
        select ID as id,
            USER as user,
            HOST as host,
            DB as db,
            COMMAND as command,
            TIME as runtime_s,
            STATE as state,
            INFO as statement
        from information_schema.processlist
        where COMMAND not in ({placeholders})
            and INFO is not null
            and ID != CONNECTION_ID()
        order by TIME desc
        ;
    """  # nosec B608
    return lib.base.coe(lib.db_mysql.select(conn, sql))


def fetch_active_transactions(conn):
    """Return the number of active InnoDB transactions. Counts every
    row in `information_schema.innodb_trx`, which includes every
    transaction the InnoDB engine is currently tracking - both running
    and locked. Reported as perfdata only, so a Grafana panel can
    trend background contention even when no single query is in the
    long-running band yet. The `innodb_trx` view requires PROCESS,
    which the plugin already enforces upstream.
    """
    sql = 'select count(*) as cnt from information_schema.innodb_trx'
    success, rows = lib.db_mysql.select(conn, sql)
    if not success or not rows:
        return 0
    try:
        return int(rows[0].get('cnt') or 0)
    except (TypeError, ValueError):
        return 0


def apply_ignore_filter(rows, ignore_regexes):
    """Drop rows whose `statement` matches any of the user-supplied
    `--ignore` regexes. The match is case-insensitive and uses
    `re.search`, so partial matches anywhere in the statement count.
    """
    if not ignore_regexes:
        return rows
    compiled = [re.compile(p, re.IGNORECASE) for p in ignore_regexes]
    return [
        r for r in rows
        if not any(p.search(r.get('statement') or '') for p in compiled)
    ]


def truncate_statement(stmt, lengthy):
    """Render the statement for the output table. The full text in
    `--lengthy` mode, an ellipsised preview otherwise.
    """
    if stmt is None:
        return ''
    # Collapse runs of whitespace so a multi-line statement fits one row.
    flat = ' '.join(stmt.split())
    if lengthy or len(flat) <= STATEMENT_PREVIEW_CHARS:
        return flat
    return flat[: STATEMENT_PREVIEW_CHARS - 3] + '...'


def main():
    """The main function. This is where the magic happens."""

    # logic taken from mysqltuner.pl:mysql_pfs(), the
    # `Performance schema: Process per time` block. mysqltuner uses
    # `sys.x$processlist`; we query `information_schema.processlist`
    # instead so the check works without enabling PFS and on every
    # MySQL/MariaDB release.

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    if args.IGNORE is None:
        args.IGNORE = []

    # fetch data
    mysql_connection = {
        'defaults_file': args.DEFAULTS_FILE,
        'defaults_group': args.DEFAULTS_GROUP,
        'timeout': args.TIMEOUT,
    }
    conn = lib.base.coe(lib.db_mysql.connect(mysql_connection))
    # PROCESS is required so the user sees other sessions' queries in
    # information_schema.processlist; without it the plugin would only
    # see its own connection and silently miss long-running queries from
    # application users (a false "all clear" is worse than UNKNOWN).
    lib.base.coe(lib.db_mysql.check_privileges(conn, 'PROCESS'))

    rows = fetch_long_queries(conn)
    rows = apply_ignore_filter(rows, args.IGNORE)
    active_transactions = fetch_active_transactions(conn)

    lib.db_mysql.close(conn)

    # init some vars
    state = STATE_OK
    sections = []
    perfdata = ''
    # Track the worst per-query state and the longest runtime for the
    # summary line and perfdata.
    max_runtime = 0
    warn_count = 0
    crit_count = 0
    table_data = []

    # analyze data
    for row in rows:
        runtime = int(row.get('runtime_s') or 0)
        if runtime > max_runtime:
            max_runtime = runtime
        item_state = lib.base.get_state(
            runtime, args.WARN, args.CRIT, _operator='range',
        )
        state = lib.base.get_worst(state, item_state)
        if item_state == STATE_CRIT:
            crit_count += 1
        elif item_state == STATE_WARN:
            warn_count += 1
        table_data.append({
            'id': row.get('id') or '',
            'user': row.get('user') or '',
            'db': row.get('db') or '',
            'runtime': f'{runtime}s',
            'statement': truncate_statement(row.get('statement'), args.LENGTHY),
            'state': lib.base.state2str(item_state),
        })

    # build the message
    n_total = len(rows)
    flagged = warn_count + crit_count
    summary = (
        f'{flagged} {lib.txt.pluralize("quer", flagged, suffix="y,ies")} '
        f'over threshold '
        f'(of {n_total} in-flight {lib.txt.pluralize("quer", n_total, suffix="y,ies")}), '
        f'longest {max_runtime}s'
    )
    if state == STATE_OK:
        sections.append('Everything is ok. ' + summary + '.')
    else:
        sections.append(summary + '.')

    if table_data:
        headers = ['ID', 'User', 'DB', 'Runtime', 'Statement', 'State']
        keys = ['id', 'user', 'db', 'runtime', 'statement', 'state']
        sections.append(lib.base.get_table(table_data, keys, header=headers))

    msg = '\n\n'.join(sections)

    perfdata += lib.base.get_perfdata(
        'mysql_long_queries_total',
        n_total,
        _min=0,
    )
    perfdata += lib.base.get_perfdata(
        'mysql_long_queries_over_warn',
        warn_count + crit_count,
        warn='0',
        _min=0,
    )
    perfdata += lib.base.get_perfdata(
        'mysql_long_queries_max_runtime_seconds',
        max_runtime,
        uom='s',
        warn=args.WARN,
        crit=args.CRIT,
        _min=0,
    )
    perfdata += lib.base.get_perfdata(
        'mysql_active_transactions',
        active_transactions,
        _min=0,
    )

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


if __name__ == '__main__':
    try:
        main()
    except Exception:
        lib.base.cu()
