describe-image.sh
· 2.7 KiB · Bash
Bruto
#!/usr/bin/env bash
#
# describe-image.sh - describe an image with a local ollama model and
# optionally translate the description into another language.
#
# Usage:
#   describe-image.sh <image> [target_language]
#
# Environment overrides (defaults in parentheses):
#   MODEL              model asked to describe the image (qwen3-vl:8b)
#   MODE               prompt style, 'simple' or 'extended' (simple)
#   WORDS              approximate word budget used by the extended prompt (100)
#   TRANSLATION_MODEL  model asked to translate the description (llama3.2)
#
# Output:      the description (or its translation) on stdout;
#              all diagnostics on stderr.
# Exit status: 0 on success, 1 on usage, validation or ollama errors.

set -o errexit
set -o pipefail
set -o nounset

# Print an error message to stderr and abort the script with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Check dependencies
if ! command -v ollama &>/dev/null; then
  die "'ollama' is not installed or not in PATH."
fi

if (( $# < 1 || $# > 2 )); then
  {
    printf 'Usage: %s <image> [target_language]\n' "$0"
    printf 'Example: %s screenshot.png de\n' "$0"
  } >&2
  exit 1
fi

# Check for the file before resolving it: a failing realpath would otherwise
# abort the script under errexit without the friendly message below.
if [[ ! -f "$1" ]]; then
  die "File '$1' not found."
fi
IMAGE_FILE=$(realpath -- "$1") || die "Could not resolve the path of '$1'."

# Validate the optional target language up front so that a bad value is
# reported before the (slow) description request is made.
TARGET_LANGUAGE=${2-}
if (( $# == 2 )) && [[ -z "$TARGET_LANGUAGE" ]]; then
  die "target_language must not be empty."
fi

# Default configuration
# pass variables to override parameters, e.g.:
#   WORDS=200 MODE=extended describe-image.sh image.jpg
MODEL=${MODEL:-qwen3-vl:8b}
MODE=${MODE:-simple}
WORDS=${WORDS:-100}
TRANSLATION_MODEL=${TRANSLATION_MODEL:-llama3.2}

if ! [[ "$WORDS" =~ ^[0-9]+$ ]]; then
  die "WORDS must be numeric."
fi

if [[ "$MODE" != "simple" && "$MODE" != "extended" ]]; then
  die "MODE must be either 'simple' or 'extended'."
fi

# The image is handed to the model by embedding its absolute path in the
# prompt text; no separate image argument is passed to 'ollama run'.
# NOTE(review): confirm that ollama picks up paths containing spaces this way.
PROMPT_SIMPLE="describe image ${IMAGE_FILE}"
PROMPT_EXTENDED=$(cat <<EOT
Please provide a functional, objective description of the provided image in no more than around ${WORDS} words so that someone who could not see it would be able to imagine it. If possible, follow an “object-action-context” framework. The object is the main focus. The action describes what’s happening, usually what the object is doing. The context describes the surrounding environment.

If there is text found in the image, do your best to transcribe the important bits, even if it extends the word count beyond ${WORDS} words. It should not contain quotation marks, as those tend to cause issues when rendered on the web.

If there is no text found in the image, then there is no need to mention it.

You should not begin the description with any variation of “The image”.

Don't output the thinking process, just the final description.

${IMAGE_FILE}
EOT
)

PROMPT_ACTUAL="$PROMPT_SIMPLE"
if [[ "$MODE" == "extended" ]]; then
  PROMPT_ACTUAL="$PROMPT_EXTENDED"
fi

RESULT=$(ollama run "$MODEL" "$PROMPT_ACTUAL") \
  || die "ollama failed to generate description."

# Without a target language the description itself is the final output.
if (( $# == 1 )); then
  printf '%s\n' "$RESULT"
  exit 0
fi

# Map the 'de' shortcode (any letter case) to the full language name for
# better prompt understanding; other values are passed through unchanged.
case "$TARGET_LANGUAGE" in
  [dD][eE]) TARGET_LANGUAGE="German" ;;
esac

# $'\n\n' yields real newlines; "\n\n" inside double quotes would send the
# model two literal backslash-n sequences instead of a blank line.
TRANSLATION_PROMPT="Translate the following text to ${TARGET_LANGUAGE}. Do not add any conversational filler, just provide the translation:"$'\n\n'"${RESULT}"

TRANSLATED_RESULT=$(ollama run "$TRANSLATION_MODEL" "$TRANSLATION_PROMPT") \
  || die "ollama failed to translate description."

printf '%s\n' "$TRANSLATED_RESULT"
| 1 | #!/usr/bin/env bash |
| 2 | |
| 3 | set -o errexit |
| 4 | set -o pipefail |
| 5 | set -o nounset |
| 6 | |
| 7 | # Check dependencies |
| 8 | if ! command -v ollama &> /dev/null; then |
| 9 | echo "Error: 'ollama' is not installed or not in PATH." |
| 10 | exit 1 |
| 11 | fi |
| 12 | |
| 13 | if [[ "$#" -ne 1 && "$#" -ne 2 ]]; then |
| 14 | echo "Usage: $0 <image> [target_language]" |
| 15 | echo "Example: $0 screenshot.png de" |
| 16 | exit 1 |
| 17 | fi |
| 18 | |
| 19 | IMAGE_FILE=$(realpath "$1") |
| 20 | |
| 21 | if [[ ! -f "$IMAGE_FILE" ]]; then |
| 22 | echo "Error: File '$IMAGE_FILE' not found." |
| 23 | exit 1 |
| 24 | fi |
| 25 | |
| 26 | # Default configuration |
| 27 | # pass variables to override parameters, e.g.: WORDS=200 MODE=extended describe-image.sh image.jpg |
| 28 | MODEL=${MODEL:-qwen3-vl:8b} |
| 29 | MODE=${MODE:-simple} |
| 30 | WORDS=${WORDS:-100} |
| 31 | TRANSLATION_MODEL=${TRANSLATION_MODEL:-llama3.2} |
| 32 | |
| 33 | if ! [[ "$WORDS" =~ ^[0-9]+$ ]]; then |
| 34 | echo "Error: WORDS must be numeric." >&2 |
| 35 | exit 1 |
| 36 | fi |
| 37 | |
| 38 | if [[ "$MODE" != "simple" && "$MODE" != "extended" ]]; then |
| 39 | echo "Error: MODE must be either 'simple' or 'extended'." >&2 |
| 40 | exit 1 |
| 41 | fi |
| 42 | |
| 43 | PROMPT_SIMPLE="describe image ${IMAGE_FILE}" |
| 44 | PROMPT_EXTENDED=$(cat <<EOT |
| 45 | Please provide a functional, objective description of the provided image in no more than around ${WORDS} so that someone who could not see it would be able to imagine it. If possible, follow an “object-action-context” framework. The object is the main focus. The action describes what’s happening, usually what the object is doing. The context describes the surrounding environment. |
| 46 | |
| 47 | If there is text found in the image, do your best to transcribe the important bits, even if it extends the word count beyond ${WORDS} words. It should not contain quotation marks, as those tend to cause issues when rendered on the web. |
| 48 | |
| 49 | If there is no text found in the image, then there is no need to mention it. |
| 50 | |
| 51 | You should not begin the description with any variation of “The image”. |
| 52 | |
| 53 | Don't output the thinking process, just the final description. |
| 54 | |
| 55 | ${IMAGE_FILE} |
| 56 | EOT |
| 57 | ) |
| 58 | |
| 59 | PROMPT_ACTUAL="$PROMPT_SIMPLE" |
| 60 | if [[ "$MODE" == "extended" ]]; then |
| 61 | PROMPT_ACTUAL="$PROMPT_EXTENDED" |
| 62 | fi |
| 63 | |
| 64 | RESULT=$(ollama run "$MODEL" "$PROMPT_ACTUAL") || { echo "Error: ollama failed to generate description." >&2; exit 1; } |
| 65 | |
| 66 | if [[ "$#" -eq 1 ]]; then |
| 67 | echo "$RESULT" |
| 68 | exit 0 |
| 69 | fi |
| 70 | |
| 71 | # If a second argument is provided, treat it as the target language |
| 72 | TARGET_LANGUAGE="$2" |
| 73 | |
| 74 | # Map 'de' shortcode to full name for better prompt understanding, optional for others |
| 75 | if [[ "$TARGET_LANGUAGE" == "de" || "$TARGET_LANGUAGE" == "DE" ]]; then |
| 76 | TARGET_LANGUAGE="German" |
| 77 | fi |
| 78 | |
| 79 | TRANSLATED_RESULT=$(ollama run "$TRANSLATION_MODEL" "Translate the following text to $TARGET_LANGUAGE. Do not add any conversational filler, just provide the translation:\n\n$RESULT") || { echo "Error: ollama failed to translate description." >&2; exit 1; } |
| 80 | |
| 81 | echo "$TRANSLATED_RESULT" |
| 82 |