#!/usr/bin/env bash

# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Check dependencies: the script shells out to the `ollama` CLI, so stop early
# when it cannot be found. The diagnostic goes to stderr so that stdout only
# ever carries the generated description.
if ! command -v ollama >/dev/null 2>&1; then
    echo "Error: 'ollama' is not installed or not in PATH." >&2
    exit 1
fi

# Validate the argument count: one image path, optionally followed by a target
# language. Being reached only on a bad invocation, the usage text is an error
# message and is written to stderr.
if (( $# < 1 || $# > 2 )); then
    {
        echo "Usage: $0 <image> [target_language]"
        echo "Example: $0 screenshot.png de"
    } >&2
    exit 1
fi

# Resolve the image argument with realpath. The failure is handled explicitly
# (instead of relying on errexit) so the user gets a message from this script;
# `--` keeps a path that begins with '-' from being parsed as an option.
if ! IMAGE_FILE=$(realpath -- "$1"); then
    echo "Error: Unable to resolve path '$1'." >&2
    exit 1
fi

# The resolved path must point at an existing regular file.
if [[ ! -f "$IMAGE_FILE" ]]; then
    echo "Error: File '$IMAGE_FILE' not found." >&2
    exit 1
fi

# Default configuration.
# Each setting can be overridden from the environment, e.g.:
#   WORDS=200 MODE=extended describe-image.sh image.jpg
: "${MODEL:=qwen3-vl:8b}"
: "${MODE:=simple}"
: "${WORDS:=100}"
: "${TRANSLATION_MODEL:=llama3.2}"

# WORDS must consist solely of decimal digits.
case "$WORDS" in
    '' | *[!0-9]*)
        echo "Error: WORDS must be numeric." >&2
        exit 1
        ;;
esac

# MODE accepts exactly two values; anything else is rejected.
case "$MODE" in
    simple | extended) ;;
    *)
        echo "Error: MODE must be either 'simple' or 'extended'." >&2
        exit 1
        ;;
esac

# Build both prompt variants. The image path is embedded in the prompt text
# itself rather than passed as a separate argument.
PROMPT_SIMPLE="describe image ${IMAGE_FILE}"

# The heredoc delimiter is unquoted on purpose so that ${WORDS} and
# ${IMAGE_FILE} are expanded inside the prompt.
PROMPT_EXTENDED=$(cat <<EOT
Please provide a functional, objective description of the provided image in no more than around ${WORDS} words so that someone who could not see it would be able to imagine it. If possible, follow an “object-action-context” framework. The object is the main focus. The action describes what’s happening, usually what the object is doing. The context describes the surrounding environment.

If there is text found in the image, do your best to transcribe the important bits, even if it extends the word count beyond ${WORDS} words. It should not contain quotation marks, as those tend to cause issues when rendered on the web.

If there is no text found in the image, then there is no need to mention it.

You should not begin the description with any variation of “The image”.

Don't output the thinking process, just the final description.

${IMAGE_FILE}
EOT
)

# MODE decides which of the two prompts is actually used.
if [[ "$MODE" == "extended" ]]; then
    PROMPT_ACTUAL="$PROMPT_EXTENDED"
else
    PROMPT_ACTUAL="$PROMPT_SIMPLE"
fi

# Ask the configured model for the description and capture its stdout.
if ! RESULT=$(ollama run "$MODEL" "$PROMPT_ACTUAL"); then
    echo "Error: ollama failed to generate description." >&2
    exit 1
fi

# With only the image argument there is nothing to translate: print and stop.
# printf is used instead of echo because the model output is arbitrary text
# and echo would treat a value such as "-n" or "-e" as an option.
if [[ "$#" -eq 1 ]]; then
    printf '%s\n' "$RESULT"
    exit 0
fi

# If a second argument is provided, treat it as the target language
TARGET_LANGUAGE="$2"

# Map 'de' shortcode to full name for better prompt understanding, optional for others
case "$TARGET_LANGUAGE" in
    de | DE) TARGET_LANGUAGE="German" ;;
esac

# Separate the instruction from the text with a real blank line. Inside plain
# double quotes "\n" stays a literal backslash followed by 'n', so ANSI-C
# quoting ($'\n\n') is used to produce actual newline characters.
TRANSLATION_PROMPT="Translate the following text to $TARGET_LANGUAGE. Do not add any conversational filler, just provide the translation:"$'\n\n'"$RESULT"

if ! TRANSLATED_RESULT=$(ollama run "$TRANSLATION_MODEL" "$TRANSLATION_PROMPT"); then
    echo "Error: ollama failed to translate description." >&2
    exit 1
fi

# printf rather than echo: the translation is arbitrary text and must not be
# interpreted as echo options.
printf '%s\n' "$TRANSLATED_RESULT"
