vlm-to-md.py
by kaeru, last modified 2025-12-02T21:01:24+08:00

File contents (4.7 KB):
#!/usr/bin/env python3
import argparse
import base64
import json
import mimetypes
import sys
from pathlib import Path

import requests


def encode_image_to_base64(image_path):
    """Encode an image file as a base64 string for API transmission."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
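

# The base64 string produced by encode_image_to_base64() is embedded in an
# RFC 2397 data: URL ("data:<mime>;base64,<data>") in the request payload
# built below.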


def process_image_with_local_server(image_path, prompt=None, server_url="http://localhost:8080/v1/chat/completions"):
    """
    Send an image to a local llama.cpp server for processing.

    Args:
        image_path (str): Path to the image file
        prompt (str, optional): Custom prompt for the API
        server_url (str): URL of the local llama.cpp server

    Returns:
        dict: Parsed JSON response from the API
    """
    # Validate that the image file exists
    if not Path(image_path).exists():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Encode the image to base64
    encoded_image = encode_image_to_base64(image_path)

    # Guess the MIME type from the file extension so PNG/WebP images are
    # labelled correctly in the data: URL; fall back to JPEG
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
    # Prepare the request payload (OpenAI-compatible chat format, as
    # accepted by llama.cpp's server)
    payload = {
        "model": "llava",  # or whatever model you're using
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt or "Analyze this image."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 16384
    }
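
    # Note: llama.cpp's OpenAI-compatible server typically serves whichever
    # model it was launched with, so the "model" field above is usually
    # informational only (an assumption; behavior varies by server version).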
    try:
        # Send the POST request; generating up to 16384 tokens on local
        # hardware can be slow, so allow a generous timeout
        response = requests.post(
            server_url,
            json=payload,
            timeout=300
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Local server request failed: {e}", file=sys.stderr)
        sys.exit(1)
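

# A typical way to start the server this script talks to (an assumption:
# a llama.cpp build with multimodal support; exact flags vary by version):
#
#   llama-server -m model.gguf --mmproj mmproj.gguf --port 8080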


def save_to_markdown(output_text, image_path):
    """
    Save the output text to a markdown file with the same name but a .md extension.

    Args:
        output_text (str): The text to save
        image_path (str): Path to the original image file
    """
    # Build the output filename with a .md extension
    image_file = Path(image_path)
    output_file = image_file.with_suffix('.md')

    # Write to the markdown file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(output_text)

    print(f"Output saved to: {output_file}")


def main():
    parser = argparse.ArgumentParser(
        description='Send an image to a local llama.cpp server for processing',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s image.jpg
  %(prog)s image.jpg --prompt "Describe what's in the image"
  %(prog)s image.jpg --prompt "Generate a caption" --server-url "http://localhost:8080/v1/chat/completions"
"""
    )
    parser.add_argument(
        'image',
        help='Path to the image file to process'
    )
    parser.add_argument(
        '--prompt',
        help='Custom prompt for the local server processing',
        default=None
    )
    parser.add_argument(
        '--server-url',
        help='URL of the local llama.cpp server (default: http://localhost:8080/v1/chat/completions)',
        default="http://localhost:8080/v1/chat/completions"
    )
    parser.add_argument(
        '--no-output-file',
        action='store_true',
        help='Do not save output to a markdown file (only print to stdout)'
    )
    args = parser.parse_args()
    try:
        # Process the image with the local server
        result = process_image_with_local_server(
            image_path=args.image,
            prompt=args.prompt,
            server_url=args.server_url
        )

        # Extract and output the response content
        if 'choices' in result and len(result['choices']) > 0:
            response_text = result['choices'][0]['message']['content']

            # Print to stdout
            print(response_text)

            # Save to a markdown file if not disabled
            if not args.no_output_file:
                save_to_markdown(response_text, args.image)
        else:
            # If the response is in a different format, output the raw JSON and save it
            json_output = json.dumps(result, indent=2)
            print(json_output)
            if not args.no_output_file:
                save_to_markdown(json_output, args.image)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
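

# --- Sketch: batch conversion (hypothetical, not called by this script) ---
# Shows how the helpers above compose to transcribe a whole folder of images,
# assuming the llama.cpp server is already running. Note that
# process_image_with_local_server() exits the process on a failed request,
# so one bad image stops the whole batch.
def batch_convert(image_dir, pattern="*.jpg", prompt="Transcribe this page to Markdown."):
    for image in sorted(Path(image_dir).glob(pattern)):
        result = process_image_with_local_server(str(image), prompt=prompt)
        response_text = result['choices'][0]['message']['content']
        save_to_markdown(response_text, str(image))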


if __name__ == '__main__':
    main()