#!/usr/bin/bash
#-
# Copyright (c) 2025 Red Hat, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Written by Mikolaj Izdebski <mizdebsk@redhat.com>
# Written by Robert Foss <rfoss@redhat.com>
# Abort on the first failing command and on any use of an unset variable.
set -o errexit -o nounset
# Globs that match nothing expand to nothing; '**' recurses into directories.
shopt -s nullglob
shopt -s globstar

# Version string printed by --version, and the name used in help/error text.
VERSION="20251029-7.el10_1"
PROGNAME=rhel-drivers

# Print the base name of the path this script was invoked as.
progname() {
    basename "$0"
}
# Report a fatal error and terminate the script.
# Arguments: the message words (joined with single spaces).
# Outputs:   "<PROGNAME>: <message>" on stderr.
# Exits with status 1; never returns.
die() {
    printf '%s\n' "${PROGNAME}: $*" >&2
    exit 1
}

# Output levels: info() prints unless quiet is 1, log() prints only when
# verbose is 1. Both start disabled.
verbose=0
quiet=0

# Print an informational message on stderr unless quiet mode is on.
# Arguments: the message words (joined with single spaces).
# Returns:   0 without printing anything when quiet is 1.
info() {
    if [ "$quiet" -eq 1 ]; then
        return 0
    fi
    printf '%s\n' "$*" >&2
}

# Print a diagnostic message on stderr, but only in verbose mode.
# Arguments: the message words (joined with single spaces).
# Returns:   0 without printing anything when verbose is not 1.
log() {
    if [ "$verbose" -eq 1 ]; then
        printf '%s\n' "$*" >&2
    fi
}

# Print the top-level help text (global options and subcommands) on stdout.
usage() {
    printf '%s\n' \
        "${PROGNAME} - install hardware drivers" \
        "" \
        "Usage:" \
        "  ${PROGNAME} [GLOBAL OPTIONS] <subcommand> [ARGS...]" \
        "" \
        "Global options:" \
        "  --help            Show this help and exit" \
        "  --version         Show version and exit" \
        "  --verbose         Increase verbosity" \
        "  --quiet           Suppress non-error output" \
        "" \
        "Subcommands:" \
        "  install           Install drivers" \
        "  remove            Remove drivers" \
        "  list              List drivers" \
        "" \
        "Run \"${PROGNAME} <subcommand> --help\" for subcommand help." \
        ""
}

# Print the help text of the 'install' subcommand on stdout.
help_install() {
    printf '%s\n' \
        "${PROGNAME} install - install hardware drivers" \
        "" \
        "Usage:" \
        "  ${PROGNAME} install [OPTIONS] [ARGS...]" \
        "" \
        "Options:" \
        "  --auto-detect     Auto-detect drivers to install" \
        "  --dry-run         Show what would happen, don't change anything" \
        "  --force           Force install (ignore checks)" \
        "  --help            Show this help for 'install' and exit" \
        "" \
        "Arguments:" \
        "  Zero or more driver identifiers." \
        ""
}

# Print the help text of the 'remove' subcommand on stdout.
help_remove() {
    printf '%s\n' \
        "${PROGNAME} remove - remove installed hardware drivers" \
        "" \
        "Usage:" \
        "  ${PROGNAME} remove [OPTIONS] [ARGS...]" \
        "" \
        "Options:" \
        "  --dry-run         Show what would happen, don't change anything" \
        "  --all             Remove all installed drivers" \
        "  --help            Show this help for 'remove' and exit" \
        "" \
        "Arguments:" \
        "  Zero or more driver identifiers." \
        ""
}

# Print the help text of the 'list' subcommand on stdout.
help_list() {
    printf '%s\n' \
        "${PROGNAME} list - list available or installed hardware drivers" \
        "" \
        "Usage:" \
        "  ${PROGNAME} list [OPTIONS]" \
        "" \
        "Options:" \
        "  --available       List available drivers (default)" \
        "  --installed       List installed drivers" \
        "  --help            Show this help for 'list' and exit" \
        "" \
        "Arguments:" \
        "  None." \
        ""
}

# -------- global option parsing (long options only) --------
# Consume leading global options. Parsing stops at "--" (which is dropped) or
# at the first word that does not start with "--", i.e. the subcommand.
while (($# > 0)); do
    case "$1" in
    --verbose) verbose=1 ;;
    --quiet) quiet=1 ;;
    --help)
        usage
        exit 0
        ;;
    --version)
        echo "${PROGNAME} $VERSION"
        exit 0
        ;;
    --)
        shift
        break
        ;;
    --*) die "Unknown global option: $1" ;;
    *) break ;;
    esac
    shift
done

# A subcommand is mandatory; everything after it is left in "$@" for it.
if (($# == 0)); then
    usage
    die "No subcommand specified"
fi

subcmd=$1
shift

# -------- functions --------
# Print the driver identifiers offered by the configured repositories.
# Outputs: one identifier per line on stdout: "amdgpu:latest" when
#          kmod-amdgpu is available, then one "nvidia:<version>" line per
#          distinct nvidia-driver version, newest first.
driver_avail() {
    local amd_versions nvidia_versions driver_versions
    local -a ids=()

    amd_versions="$(dnf -q repoquery --qf 'amdgpu:latest\n' kmod-amdgpu | sort -u -V -r | sed -n '/^amdgpu:/p')"
    # -u: several builds of the same version must yield a single identifier,
    # exactly as the amdgpu query above already does.
    nvidia_versions="$(dnf -q repoquery --qf 'nvidia:%{version}\n' nvidia-driver | sort -u -V -r | sed -n '/^nvidia:/p')"
    driver_versions="$amd_versions $nvidia_versions"
    log "Available driver versions:
    $driver_versions"

    # Split into words with read instead of an unquoted expansion, so the
    # identifiers never undergo pathname expansion (nullglob is enabled).
    read -r -d '' -a ids <<<"$driver_versions" || true
    printf '%s\n' "${ids[@]}"
}

# Print the value of the first "enabled" key found at or after the first line
# of repo file $2 that mentions repository id $1 (nothing if there is none).
_repo_enabled_value() {
    sed -n '/'"$1"'/,$p' "$2" | grep enabled | head -1 | cut -d= -f2 | cut -d' ' -f2
}

# Make sure the RHEL "supplementary" and "extensions" repositories are enabled.
# Without subscription-manager on PATH only a warning is printed. Otherwise
# the enabled state is read from /etc/yum.repos.d/redhat.repo and every
# repository not marked "enabled = 1" is enabled with a single
# "subscription-manager repos" call.
# Outputs: "Enabling repository: ..." lines on stdout.
# Returns: 0 on success; calls die if subscription-manager fails.
verify_repos() {
    local repo_file=/etc/yum.repos.d/redhat.repo
    local version machine supplementary_repo extensions_repo
    local supplementary_enabled=0 extensions_enabled=0
    local -a enable_args=()

    log "[verify_repos] Checking repository status"

    if ! type -P subscription-manager >/dev/null 2>&1; then
        info "Warning: Subscription Manager is absent."
        info "You may need to enable appropriate repositories yourself."
        return 0
    fi

    # Major RHEL version: integer part of VERSION_ID in os-release
    version=$(awk -F'=' '/VERSION_ID/{ gsub(/"/,""); print $2=int($2)}' /etc/os-release)
    machine=$(arch)

    supplementary_repo="rhel-${version}-for-${machine}-supplementary-rpms"
    extensions_repo="rhel-${version}-for-${machine}-extensions-rpms"

    log "[verify_repos] Required repositories: $supplementary_repo, $extensions_repo"

    if [ -f "$repo_file" ]; then
        supplementary_enabled=$(_repo_enabled_value "$supplementary_repo" "$repo_file")
        extensions_enabled=$(_repo_enabled_value "$extensions_repo" "$repo_file")
    fi

    if [ "$supplementary_enabled" = "1" ]; then
        log "[verify_repos] Repository already enabled: $supplementary_repo"
    else
        echo "Enabling repository: ${supplementary_repo}"
        enable_args+=("--enable=$supplementary_repo")
    fi

    if [ "$extensions_enabled" = "1" ]; then
        log "[verify_repos] Repository already enabled: $extensions_repo"
    else
        echo "Enabling repository: ${extensions_repo}"
        enable_args+=("--enable=$extensions_repo")
    fi

    # The arguments are passed as an array, so no eval or re-parsing of a
    # command string is involved.
    if [ "${#enable_args[@]}" -gt 0 ]; then
        subscription-manager repos "${enable_args[@]}" || die "Failed to enable repositories"
    fi

    log "[verify_repos] Repository verification complete"
}

# Print the names of drivers for which hardware was detected.
# Only autodetect_nvidia is consulted; its failure is tolerated and simply
# leaves the result empty.
# Returns: 1, printing nothing, when no hardware was detected.
autodetect() {
    drivers_found=$(autodetect_nvidia || true)
    log "[autodetect] Found the following hardware: $drivers_found"
    if [ -z "$drivers_found" ]; then
        return 1
    fi
    echo "$drivers_found"
}

# Detect an NVIDIA display device that the supported-GPU list marks with the
# "kernelopen" feature.
#
# Every "modalias" file below the modalias root is parsed as a PCI modalias;
# the first display-class (03) device of vendor 10de whose device ID is in
# the list wins.
#
# Environment (both optional, intended for testing):
#   __rhel_drivers_modalias_path   directory scanned instead of /sys/devices
#   __rhel_drivers_supported_gpus  JSON list used instead of the packaged one
# Outputs: "nvidia" on stdout when a supported GPU is found.
# Returns: 0 if found, 1 otherwise; calls die when the JSON list is missing.
# Requires globstar (and nullglob) to be enabled by the caller.
autodetect_nvidia() {
    local pci_class_display="03"
    local nvidia_vendor="10de"
    local supported_gpus="${__rhel_drivers_supported_gpus:-/usr/share/rhel-drivers/nvidia/supported-gpus.json}"
    local modalias_path="${__rhel_drivers_modalias_path:-/sys/devices}"
    # PCI modalias fields have fixed widths (8 hex digits for the IDs, 2 for
    # the class bytes). Matching fixed widths keeps a hex digit 'd' inside an
    # ID from being mistaken for the 'd' field marker once lowercased.
    local hex8='([0-9a-f]{8})' hex2='([0-9a-f]{2})'
    local regex_pattern="^pci:v${hex8}d${hex8}sv${hex8}sd${hex8}bc${hex2}sc${hex2}i${hex2}"'$'
    local modalias_file modalias vendor device baseclass devid name
    local -A gpus=()

    if [ ! -e "$supported_gpus" ]; then
        die "Can't find $supported_gpus"
    fi

    # Load "devid<TAB>name" pairs; the data is only ever stored, never
    # evaluated as shell code.
    while IFS=$'\t' read -r devid name; do
        if [ -n "$devid" ]; then
            gpus[$devid]=$name
        fi
    done < <(jq -r '.chips[] | select(.features | index("kernelopen")) | [(.devid | sub("^0x"; "") | ascii_downcase), .name] | @tsv' "$supported_gpus")

    for modalias_file in "$modalias_path"/**/modalias; do
        modalias=$(<"$modalias_file") || continue
        modalias=${modalias,,}
        [[ "$modalias" =~ $regex_pattern ]] || continue

        # IDs are zero-padded to 8 digits; the low 4 are the 16-bit PCI ID.
        vendor="${BASH_REMATCH[1]: -4}"
        device="${BASH_REMATCH[2]: -4}"
        baseclass="${BASH_REMATCH[5]}"

        [ "$baseclass" = "$pci_class_display" ] || continue
        [ "$vendor" = "$nvidia_vendor" ] || continue

        # ${var+word} tests for key presence without any word that nullglob
        # could remove (an unquoted gpus[...] word is a glob pattern).
        if [[ -n "${gpus[$device]+set}" ]]; then
            log "Found GPU: ${gpus[$device]}"
            echo "nvidia"
            return 0
        fi
    done
    return 1
}

# Additional packages that install_nvidia installs and remove_nvidia removes
# together with the nvidia-driver and nvidia-driver-cuda packages.
# This is a single whitespace-separated string (the backslash-newlines inside
# the quotes are line continuations); consumers split it into words.
nvidia_packages="cublasmp \
    cuda-compat \
    cuda-toolkit \
    cudnn \
    dnf-plugin-nvidia \
    libnccl-devel \
    libnccl-static \
    nvidia-fabricmanager \
    nvidia-fabric-manager-devel"

# Install the NVIDIA driver, its CUDA counterpart and the extra packages
# listed in nvidia_packages.
# Arguments: $1 - "nvidia:<version>" for a specific version; any other value
#                 (e.g. "nvidia") selects the newest available version.
# Globals:   dry_run (read) - when non-zero nothing is installed
#            nvidia_packages (read)
# Returns:   status of "dnf install"; calls die when the requested version,
#            or any NVIDIA driver at all, is not available.
install_nvidia() {
    local driver="$1"
    local driver_versions nvidia_versions latest version
    local nvidia_driver_pkg nvidia_driver_cuda_pkg
    local -a packages=()

    driver_versions="$(driver_avail)"
    # Only NVIDIA entries may compete for "latest"; driver_avail also lists
    # amdgpu:latest.
    nvidia_versions="$(grep '^nvidia:' <<<"$driver_versions" || true)"
    [ -n "$nvidia_versions" ] || die "No NVIDIA driver available in the enabled repositories"
    latest="$(sort -r -V <<<"$nvidia_versions" | head -n 1)"
    version="$latest"
    log "[install_nvidia] Latest driver version: $latest"

    case "$driver" in
    nvidia:*)
        log "[install_nvidia] Requested driver: $driver"
        if ! grep -Fxq -- "$driver" <<<"$nvidia_versions"; then
            die "No such driver: $driver"
        fi
        version="$driver"
        ;;
    esac
    version="${version#nvidia:}"

    # -F: the dots of the version are literal, not regex wildcards.
    nvidia_driver_pkg="$(dnf -q repoquery nvidia-driver | grep -F -- "$version" || true)"
    nvidia_driver_cuda_pkg="$(dnf -q repoquery nvidia-driver-cuda | grep -F -- "$version" || true)"
    [ -n "$nvidia_driver_pkg" ] || die "Package nvidia-driver not found for version $version"
    [ -n "$nvidia_driver_cuda_pkg" ] || die "Package nvidia-driver-cuda not found for version $version"

    # Build the package list as an array; no eval, no pathname expansion.
    read -r -d '' -a packages <<<"$nvidia_driver_pkg $nvidia_driver_cuda_pkg $nvidia_packages" || true

    log "Installing NVIDIA + CUDA driver: $version"
    if [ "$dry_run" -eq 0 ]; then
        dnf install "${packages[@]}"
    fi
}

# Remove one NVIDIA driver version together with the extra packages listed
# in nvidia_packages.
# Arguments: $1 - driver version without the "nvidia:" prefix
# Globals:   dry_run (read) - when non-zero nothing is removed
#            nvidia_packages (read)
# Returns:   status of "dnf remove" (0 in dry-run mode).
remove_nvidia() {
    local version="$1"
    local -a extra_packages=() dnf_cmd=()

    read -r -d '' -a extra_packages <<<"$nvidia_packages" || true
    # Argument array instead of eval: the version is passed to dnf verbatim
    # and can never be interpreted as shell syntax.
    dnf_cmd=(dnf remove "nvidia-driver-$version" "nvidia-driver-cuda-$version" "${extra_packages[@]}")
    log "Remove: ${dnf_cmd[*]}"
    if [ "$dry_run" -eq 0 ]; then
        "${dnf_cmd[@]}"
    fi
}

# Install the AMD GPU driver package (kmod-amdgpu).
# Globals: dry_run (read) - when non-zero nothing is installed
# Returns: status of "dnf install" (0 in dry-run mode).
install_amd() {
    # Local argument array: nothing to eval, no global left behind.
    local -a dnf_cmd=(dnf install kmod-amdgpu)
    log "Installing AMD GPU driver"
    if [ "$dry_run" -eq 0 ]; then
        "${dnf_cmd[@]}"
    fi
}

# Remove the AMD GPU driver package (kmod-amdgpu).
# Globals: dry_run (read) - when non-zero nothing is removed
# Returns: status of "dnf remove" (0 in dry-run mode).
remove_amd() {
    # Local argument array: nothing to eval, no global left behind.
    local -a dnf_cmd=(dnf remove kmod-amdgpu)
    log "Remove: ${dnf_cmd[*]}"
    if [ "$dry_run" -eq 0 ]; then
        "${dnf_cmd[@]}"
    fi
}

# -------- subcommands --------
# 'install' subcommand: install the requested or auto-detected drivers.
# Arguments: [--auto-detect] [--dry-run] [--force] [--] [driver...]
#            A driver is "amdgpu", "amdgpu:latest", "nvidia" or
#            "nvidia:<version>".
# Globals:   auto_detect, dry_run, force (written; dry_run is read by the
#            install_* helpers)
# Unless --force is given, every requested driver name must be "amdgpu" or
# one reported by autodetect; this is checked before repositories are touched.
# Calls die on invalid usage, incompatible or unknown drivers.
cmd_install() {
    auto_detect=0 dry_run=0 force=0
    local detected compatible driver name
    local -a drivers=()

    # parse install options
    while [ "$#" -gt 0 ]; do
        case "$1" in
        --help)
            help_install
            exit 0
            ;;
        --auto-detect)
            auto_detect=1
            shift
            ;;
        --dry-run)
            dry_run=1
            shift
            ;;
        --force)
            force=1
            shift
            ;;
        --)
            shift
            break
            ;;
        --*) die "Unknown option for 'install': $1" ;;
        *) break ;;
        esac
    done

    log "[install] auto_detect=$auto_detect dry_run=$dry_run force=$force"
    log "[install] drivers: [$*]"

    if [ "$#" -eq 0 ] && [ "$auto_detect" -eq 0 ]; then
        die "Not specified what to install"
    fi
    if [ "$#" -gt 0 ] && [ "$auto_detect" -ne 0 ]; then
        die "Both autodetect and something to install"
    fi

    # Keep the drivers in an array so they are never re-split or globbed.
    drivers=("$@")
    if [ "$auto_detect" -ne 0 ]; then
        detected=$(autodetect) || die "No compatible hardware found!"
        read -r -d '' -a drivers <<<"$detected" || true
    elif [ "$force" -eq 0 ]; then
        # amdgpu is always accepted; other names need detected hardware.
        compatible="amdgpu"
        if detected=$(autodetect); then
            compatible+=" $detected"
        else
            info "No compatible hardware found!"
        fi
        for driver in "$@"; do
            name=${driver%%:*}
            # Whole-word comparison: a fragment such as "gpu" must not pass.
            case " ${compatible//$'\n'/ } " in
            *" $name "*) ;;
            *) die "Driver not found: $name" ;;
            esac
        done
    fi

    # Verify and enable required repositories
    verify_repos

    log "[install] Installing the following drivers: ${drivers[*]}"
    for driver in "${drivers[@]}"; do
        case "$driver" in
        nvidia*) install_nvidia "$driver" ;;
        amdgpu | amdgpu:latest) install_amd ;;
        *) die "Unknown driver: $driver" ;;
        esac
    done
}

# 'remove' subcommand: remove the named drivers.
# Arguments: [--dry-run] [--all] [--] [driver...]
#            A driver is "amdgpu", "amdgpu:latest" or "nvidia:<version>",
#            where the NVIDIA identifier must equal driver_inst_nvidia.
# Globals:   dry_run, all, version, dnf_cmd (written); driver_inst_nvidia (read)
# Calls die on invalid usage, unknown drivers, drivers that are not
# installed, and for --all, which is not implemented.
cmd_remove() {
    dry_run=0 all=0
    local target

    while (($# > 0)); do
        case "$1" in
        --dry-run) dry_run=1 ;;
        --all) all=1 ;;
        --help)
            help_remove
            exit 0
            ;;
        --)
            shift
            break
            ;;
        --*) die "Unknown option for 'remove': $1" ;;
        *) break ;;
        esac
        shift
    done

    if (($# == 0 && all == 0)); then
        die "Not specified what to remove"
    fi
    if (($# > 0 && all != 0)); then
        die "Both all and something to remove"
    fi
    if ((all != 0)); then
        die "Removal of all drivers is not implemented yet"
    fi

    dnf_cmd="dnf remove"
    for target in "$@"; do
        case "$target" in
        amdgpu | amdgpu:latest)
            remove_amd
            ;;
        nvidia:*)
            if ! grep -Fxq "$target" <<<"$driver_inst_nvidia"; then
                die "Driver not installed: $target"
            fi
            version="${target#nvidia:}"
            remove_nvidia "$version"
            ;;
        *) die "Unknown driver: $target" ;;
        esac
    done
}

# 'list' subcommand: show available and/or installed drivers.
# Arguments: [--available] [--installed]; --available is the default.
# Globals:   driver_inst_nvidia, driver_inst_amd (read)
# Outputs:   the lists are emitted through info().
# In the "available" list each line is prefixed with two marker columns:
#   '*' = installed, '>' = hardware for this driver was auto-detected.
# Only amdgpu:latest, the newest entry and installed entries are shown.
cmd_list() {
    local opt_available=0 opt_installed=0
    local driver_versions autodetected latest driver
    local mark_installed mark_autodetect marked_avail

    while [ "$#" -gt 0 ]; do
        case "$1" in
        --help)
            help_list
            exit 0
            ;;
        --available)
            opt_available=1
            shift
            ;;
        --installed)
            opt_installed=1
            shift
            ;;
        --)
            shift
            break
            ;;
        --*) die "Unknown option for 'list': $1" ;;
        *) break ;;
        esac
    done

    [ "$#" -eq 0 ] || die "'list' does not take arguments"

    log "[list] available=$opt_available installed=$opt_installed"

    if [ "$opt_available" -eq 0 ] && [ "$opt_installed" -eq 0 ]; then
        opt_available=1
    fi

    if [ "$opt_available" -eq 1 ]; then
        # Verify and enable required repositories
        verify_repos

        driver_versions="$(driver_avail)"

        # Check which drivers have autodetected hardware support
        autodetected=$(autodetect 2>/dev/null || true)
        if [ -n "$autodetected" ]; then
            log "[list] Autodetected hardware: $autodetected"
        fi

        # Newest entry of the whole list; it does not change per driver, so
        # compute it once.
        latest="$(sort -V -r <<<"$driver_versions" | head -n 1)"

        marked_avail=""
        while IFS= read -r driver; do
            [ -n "$driver" ] || continue

            log "[list] Comparing driver='$driver' with driver_inst_nvidia='$driver_inst_nvidia' and driver_inst_amd='$driver_inst_amd'"
            mark_installed=" "
            if [ "$driver" = "$driver_inst_nvidia" ] || [ "$driver" = "$driver_inst_amd" ]; then
                mark_installed="*"
            fi
            mark_autodetect=" "
            if grep -qw -- "${driver%%:*}" <<<"$autodetected"; then
                mark_autodetect=">"
            fi

            # Exact comparison with the newest entry: an entry that merely
            # contains it (e.g. "<latest>~rc1") is not the latest one.
            if [ "$driver" = "amdgpu:latest" ] || [ "$driver" = "$latest" ] || [ "$mark_installed" = "*" ]; then
                marked_avail+="${mark_installed}${mark_autodetect} ${driver}"$'\n'
            fi
        done <<<"$driver_versions"

        # marked_avail keeps its trailing newline, so a blank line follows
        # the list.
        info "Available drivers:
${marked_avail}"
    fi

    if [ "$opt_installed" -eq 1 ]; then
        info "Installed drivers:${driver_inst_amd:+
$driver_inst_amd}${driver_inst_nvidia:+
$driver_inst_nvidia}"
    fi
}

# Record which drivers are installed, as identifiers comparable with the
# ones printed by driver_avail: "nvidia:<version>" / "amdgpu:latest", or an
# empty string when the respective package is not installed.
if ! driver_inst_nvidia=nvidia:$(rpm -q --qf '%{version}' nvidia-driver 2>/dev/null); then
    driver_inst_nvidia=""
    log "NVIDIA driver is currently NOT installed"
else
    log "Currently installed NVIDIA driver version: $driver_inst_nvidia"
fi

driver_inst_amd=""
if rpm -q kmod-amdgpu >/dev/null 2>&1; then
    driver_inst_amd="amdgpu:latest"
    log "Currently installed AMD GPU driver"
else
    log "AMD GPU driver is currently NOT installed"
fi

# -------- dispatch --------
# Hand the remaining arguments to the handler of the requested subcommand.
# "help"/"version" are also accepted in their option spelling here.
case "$subcmd" in
help | --help)
    usage
    exit 0
    ;;
version | --version)
    echo "${PROGNAME} $VERSION"
    exit 0
    ;;
list | ls)
    cmd_list "$@"
    ;;
install | in)
    cmd_install "$@"
    ;;
remove | rm)
    cmd_remove "$@"
    ;;
*)
    usage
    die "Unknown subcommand: $subcmd"
    ;;
esac
