describe-image.sh

describe-image.sh · 2.7 KiB · Bash Bruto

#!/usr/bin/env bash
#
# describe-image.sh - describe an image with a local ollama model and
# optionally translate the description with a second model.
#
# Usage:   describe-image.sh <image> [target_language]
# Example: WORDS=200 MODE=extended describe-image.sh screenshot.png de
#
# Environment overrides:
#   MODEL              model that describes the image   (default: qwen3-vl:8b)
#   MODE               prompt style, simple or extended (default: simple)
#   WORDS              word budget used by MODE=extended (default: 100)
#   TRANSLATION_MODEL  model that translates the text   (default: llama3.2)
#
# Exit status: 0 on success, 1 on any usage, validation or ollama error.

set -o errexit
set -o pipefail
set -o nounset

# Print "Error: <message>" to stderr and abort with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Check dependencies
command -v ollama > /dev/null 2>&1 \
  || die "'ollama' is not installed or not in PATH."

if [[ "$#" -ne 1 && "$#" -ne 2 ]]; then
  {
    printf 'Usage: %s <image> [target_language]\n' "$0"
    printf 'Example: %s screenshot.png de\n' "$0"
  } >&2
  exit 1
fi

# Check the path as given before resolving it: a failing realpath would
# otherwise abort the script under errexit before this message is printed.
[[ -f "$1" ]] || die "File '$1' not found."
IMAGE_FILE=$(realpath -- "$1") || die "Could not resolve the path of '$1'."

# Default configuration
# pass variables to override parameters, e.g.:
#   WORDS=200 MODE=extended describe-image.sh image.jpg
MODEL=${MODEL:-qwen3-vl:8b}
MODE=${MODE:-simple}
WORDS=${WORDS:-100}
TRANSLATION_MODEL=${TRANSLATION_MODEL:-llama3.2}

[[ "$WORDS" =~ ^[0-9]+$ ]] || die "WORDS must be numeric."

case "$MODE" in
  simple | extended) ;;
  *) die "MODE must be either 'simple' or 'extended'." ;;
esac

# The image path is embedded in the prompt text itself.
# NOTE(review): this relies on ollama picking the image up from a path inside
# the prompt - confirm for the configured MODEL.
PROMPT_SIMPLE="describe image ${IMAGE_FILE}"
PROMPT_EXTENDED="Please provide a functional, objective description of the provided image in no more than around ${WORDS} words so that someone who could not see it would be able to imagine it. If possible, follow an “object-action-context” framework. The object is the main focus. The action describes what’s happening, usually what the object is doing. The context describes the surrounding environment.

If there is text found in the image, do your best to transcribe the important bits, even if it extends the word count beyond ${WORDS} words. It should not contain quotation marks, as those tend to cause issues when rendered on the web.

If there is no text found in the image, then there is no need to mention it.

You should not begin the description with any variation of “The image”.

Don't output the thinking process, just the final description.

${IMAGE_FILE}"

PROMPT_ACTUAL="$PROMPT_SIMPLE"
if [[ "$MODE" == "extended" ]]; then
  PROMPT_ACTUAL="$PROMPT_EXTENDED"
fi

RESULT=$(ollama run "$MODEL" "$PROMPT_ACTUAL") \
  || die "ollama failed to generate description."

# Without a target language the description itself is the final output.
if [[ "$#" -eq 1 ]]; then
  printf '%s\n' "$RESULT"
  exit 0
fi

# If a second argument is provided, treat it as the target language
TARGET_LANGUAGE="$2"

# Map 'de' shortcode to full name for better prompt understanding,
# optional for others
case "$TARGET_LANGUAGE" in
  de | DE) TARGET_LANGUAGE="German" ;;
esac

# Real newlines separate the instruction from the text; a "\n" escape inside
# double quotes would reach the model as a literal backslash followed by n.
TRANSLATION_PROMPT="Translate the following text to ${TARGET_LANGUAGE}. Do not add any conversational filler, just provide the translation:

${RESULT}"

TRANSLATED_RESULT=$(ollama run "$TRANSLATION_MODEL" "$TRANSLATION_PROMPT") \
  || die "ollama failed to translate description."

printf '%s\n' "$TRANSLATED_RESULT"

#!/usr/bin/env bash
#
# describe-image.sh - describe an image with a local ollama model and
# optionally translate the description with a second model.
#
# Usage:   describe-image.sh <image> [target_language]
# Example: WORDS=200 MODE=extended describe-image.sh screenshot.png de
#
# Environment overrides:
#   MODEL              model that describes the image   (default: qwen3-vl:8b)
#   MODE               prompt style, simple or extended (default: simple)
#   WORDS              word budget used by MODE=extended (default: 100)
#   TRANSLATION_MODEL  model that translates the text   (default: llama3.2)
#
# Exit status: 0 on success, 1 on any usage, validation or ollama error.

set -o errexit
set -o pipefail
set -o nounset

# Print "Error: <message>" to stderr and abort with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Check dependencies
command -v ollama > /dev/null 2>&1 \
  || die "'ollama' is not installed or not in PATH."

if [[ "$#" -ne 1 && "$#" -ne 2 ]]; then
  {
    printf 'Usage: %s <image> [target_language]\n' "$0"
    printf 'Example: %s screenshot.png de\n' "$0"
  } >&2
  exit 1
fi

# Check the path as given before resolving it: a failing realpath would
# otherwise abort the script under errexit before this message is printed.
[[ -f "$1" ]] || die "File '$1' not found."
IMAGE_FILE=$(realpath -- "$1") || die "Could not resolve the path of '$1'."

# Default configuration
# pass variables to override parameters, e.g.:
#   WORDS=200 MODE=extended describe-image.sh image.jpg
MODEL=${MODEL:-qwen3-vl:8b}
MODE=${MODE:-simple}
WORDS=${WORDS:-100}
TRANSLATION_MODEL=${TRANSLATION_MODEL:-llama3.2}

[[ "$WORDS" =~ ^[0-9]+$ ]] || die "WORDS must be numeric."

case "$MODE" in
  simple | extended) ;;
  *) die "MODE must be either 'simple' or 'extended'." ;;
esac

# The image path is embedded in the prompt text itself.
# NOTE(review): this relies on ollama picking the image up from a path inside
# the prompt - confirm for the configured MODEL.
PROMPT_SIMPLE="describe image ${IMAGE_FILE}"
PROMPT_EXTENDED="Please provide a functional, objective description of the provided image in no more than around ${WORDS} words so that someone who could not see it would be able to imagine it. If possible, follow an “object-action-context” framework. The object is the main focus. The action describes what’s happening, usually what the object is doing. The context describes the surrounding environment.

If there is text found in the image, do your best to transcribe the important bits, even if it extends the word count beyond ${WORDS} words. It should not contain quotation marks, as those tend to cause issues when rendered on the web.

If there is no text found in the image, then there is no need to mention it.

You should not begin the description with any variation of “The image”.

Don't output the thinking process, just the final description.

${IMAGE_FILE}"

PROMPT_ACTUAL="$PROMPT_SIMPLE"
if [[ "$MODE" == "extended" ]]; then
  PROMPT_ACTUAL="$PROMPT_EXTENDED"
fi

RESULT=$(ollama run "$MODEL" "$PROMPT_ACTUAL") \
  || die "ollama failed to generate description."

# Without a target language the description itself is the final output.
if [[ "$#" -eq 1 ]]; then
  printf '%s\n' "$RESULT"
  exit 0
fi

# If a second argument is provided, treat it as the target language
TARGET_LANGUAGE="$2"

# Map 'de' shortcode to full name for better prompt understanding,
# optional for others
case "$TARGET_LANGUAGE" in
  de | DE) TARGET_LANGUAGE="German" ;;
esac

# Real newlines separate the instruction from the text; a "\n" escape inside
# double quotes would reach the model as a literal backslash followed by n.
TRANSLATION_PROMPT="Translate the following text to ${TARGET_LANGUAGE}. Do not add any conversational filler, just provide the translation:

${RESULT}"

TRANSLATED_RESULT=$(ollama run "$TRANSLATION_MODEL" "$TRANSLATION_PROMPT") \
  || die "ollama failed to translate description."

printf '%s\n' "$TRANSLATED_RESULT"