import asyncio
import json
import os
import time
from typing import Dict, Any

import aiohttp


async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
    """
    Measures streaming performance of a chat-completions endpoint by sending a
    prompt and recording time to first token, total tokens, total time, and
    tokens per second.

    Args:
        api_endpoint (str): The API endpoint URL.
        prompt (str): The prompt to send to the API.

    Returns:
        Dict[str, Any]: A dictionary containing performance metrics or error information.
    """
    results: Dict[str, Any] = {}
    request_payload = {
        "model": "llama-3.2-3b",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "stream": True,
    }

    async with aiohttp.ClientSession() as session:
        try:
            # perf_counter is monotonic, so deltas are immune to system clock adjustments.
            start_time = time.perf_counter()
            first_token_time = None
            total_tokens = 0

            async with session.post(api_endpoint, json=request_payload) as response:
                if response.status != 200:
                    results["error"] = f"HTTP {response.status}: {response.reason}"
                    return results

                async for raw_line in response.content:
                    line = raw_line.decode('utf-8').strip()
                    if not line or not line.startswith('data: '):
                        continue

                    line_content = line[6:]  # Remove 'data: ' prefix
                    if line_content == '[DONE]':
                        break

                    try:
                        chunk = json.loads(line_content)
                        choice = chunk.get('choices', [{}])[0]
                        content = choice.get('delta', {}).get('content')
                        if content:
                            if first_token_time is None:
                                first_token_time = time.perf_counter()
                                results["time_to_first_token"] = first_token_time - start_time
                            # Each streamed chunk typically carries one token, so the
                            # chunk count serves as an approximate token count.
                            total_tokens += 1
                    except json.JSONDecodeError:
                        # Log or handle malformed JSON if necessary
                        continue

            end_time = time.perf_counter()
            total_time = end_time - start_time

            if total_tokens > 0:
                # Note: this rate includes time to first token (prompt processing),
                # so it understates the pure decode rate.
                results.update({
                    "tokens_per_second": total_tokens / total_time,
                    "total_tokens": total_tokens,
                    "total_time": total_time,
                })
            else:
                results["error"] = "No tokens were generated"
        except aiohttp.ClientError as e:
            results["error"] = f"Client error: {e}"
        except Exception as e:
            results["error"] = f"Unexpected error: {e}"

    return results


async def main() -> None:
    api_endpoint = "http://localhost:52415/v1/chat/completions"

    # Define prompts
    prompt_basic = "this is a ping"
    prompt_essay = "write an essay about cats"

    # Measure performance for the basic prompt, which doubles as a warm-up run
    print("Measuring performance for the basic prompt...")
    results_basic = await measure_performance(api_endpoint, prompt_basic)
    print("Basic prompt performance metrics:")
    print(json.dumps(results_basic, indent=4))

    # Measure performance for the essay prompt
    print("\nMeasuring performance for the essay prompt...")
    results = await measure_performance(api_endpoint, prompt_essay)

    # Save metrics from the essay prompt
    metrics_file = os.path.join("artifacts", "benchmark.json")
    os.makedirs(os.path.dirname(metrics_file), exist_ok=True)
    try:
        with open(metrics_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)
        print(f"Performance metrics saved to {metrics_file}")
    except OSError as e:
        print(f"Failed to save metrics: {e}")

    # Print the metrics for visibility
    print("Performance metrics:")
    print(json.dumps(results, indent=4))


if __name__ == "__main__":
    asyncio.run(main())