vlm-to-md.py
by kaeru, last modified 2025-12-02T21:01:24+08:00

File contents (4.7 KB):
#!/usr/bin/env python3
import argparse
import base64
import json
import mimetypes
import sys
from pathlib import Path

import requests


def encode_image_to_base64(image_path):
    """Encode an image file as a base64 string for API transmission."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
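

# The base64 string produced by encode_image_to_base64() is embedded in an
# RFC 2397 data: URL ("data:<mime>;base64,<data>") in the request payload
# built below.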


def process_image_with_local_server(image_path, prompt=None, server_url="http://localhost:8080/v1/chat/completions"):
    """
    Send an image to a local llama.cpp server for processing.

    Args:
        image_path (str): Path to the image file
        prompt (str, optional): Custom prompt for the API
        server_url (str): URL of the local llama.cpp server

    Returns:
        dict: Parsed JSON response from the API
    """
    # Validate that the image file exists
    if not Path(image_path).exists():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Encode the image to base64
    encoded_image = encode_image_to_base64(image_path)

    # Guess the MIME type from the file extension so PNG/WebP images are
    # labelled correctly in the data: URL; fall back to JPEG
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
    # Prepare the request payload (OpenAI-compatible chat format, as
    # accepted by llama.cpp's server)
    payload = {
        "model": "llava",  # or whatever model you're using
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt or "Analyze this image."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 16384
    }
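
    # Note: llama.cpp's OpenAI-compatible server typically serves whichever
    # model it was launched with, so the "model" field above is usually
    # informational only (an assumption; behavior varies by server version).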
    try:
        # Send the POST request; generating up to 16384 tokens on local
        # hardware can be slow, so allow a generous timeout
        response = requests.post(
            server_url,
            json=payload,
            timeout=300
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Local server request failed: {e}", file=sys.stderr)
        sys.exit(1)
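

# A typical way to start the server this script talks to (an assumption:
# a llama.cpp build with multimodal support; exact flags vary by version):
#
#   llama-server -m model.gguf --mmproj mmproj.gguf --port 8080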


def save_to_markdown(output_text, image_path):
    """
    Save the output text to a markdown file with the same name but a .md extension.

    Args:
        output_text (str): The text to save
        image_path (str): Path to the original image file
    """
    # Build the output filename with a .md extension
    image_file = Path(image_path)
    output_file = image_file.with_suffix('.md')

    # Write to the markdown file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(output_text)

    print(f"Output saved to: {output_file}")


def main():
    parser = argparse.ArgumentParser(
        description='Send an image to a local llama.cpp server for processing',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s image.jpg
  %(prog)s image.jpg --prompt "Describe what's in the image"
  %(prog)s image.jpg --prompt "Generate a caption" --server-url "http://localhost:8080/v1/chat/completions"
"""
    )
    parser.add_argument(
        'image',
        help='Path to the image file to process'
    )
    parser.add_argument(
        '--prompt',
        help='Custom prompt for the local server processing',
        default=None
    )
    parser.add_argument(
        '--server-url',
        help='URL of the local llama.cpp server (default: http://localhost:8080/v1/chat/completions)',
        default="http://localhost:8080/v1/chat/completions"
    )
    parser.add_argument(
        '--no-output-file',
        action='store_true',
        help='Do not save output to a markdown file (only print to stdout)'
    )
    args = parser.parse_args()
    try:
        # Process the image with the local server
        result = process_image_with_local_server(
            image_path=args.image,
            prompt=args.prompt,
            server_url=args.server_url
        )

        # Extract and output the response content
        if 'choices' in result and len(result['choices']) > 0:
            response_text = result['choices'][0]['message']['content']

            # Print to stdout
            print(response_text)

            # Save to a markdown file if not disabled
            if not args.no_output_file:
                save_to_markdown(response_text, args.image)
        else:
            # If the response is in a different format, output the raw JSON and save it
            json_output = json.dumps(result, indent=2)
            print(json_output)
            if not args.no_output_file:
                save_to_markdown(json_output, args.image)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
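

# --- Sketch: batch conversion (hypothetical, not called by this script) ---
# Shows how the helpers above compose to transcribe a whole folder of images,
# assuming the llama.cpp server is already running. Note that
# process_image_with_local_server() exits the process on a failed request,
# so one bad image stops the whole batch.
def batch_convert(image_dir, pattern="*.jpg", prompt="Transcribe this page to Markdown."):
    for image in sorted(Path(image_dir).glob(pattern)):
        result = process_image_with_local_server(str(image), prompt=prompt)
        response_text = result['choices'][0]['message']['content']
        save_to_markdown(response_text, str(image))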


if __name__ == '__main__':
    main()