import json

import gradio as gr
from llama_cpp import Llama

# 1. LOAD THE ENGINE
# We are using an 8-bit quantized (Q8_0) GGUF build of Llama 3.2 3B.
# This is the "Owner's Engine" - it runs locally on your Space.
print("Loading Coretex Engine...")
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-3b-instruct-q8_0.gguf",
    n_ctx=2048,   # Context window
    n_threads=2,  # Matches the 2 vCPUs on Hugging Face Free
)


def load_knowledge():
    """Load custom knowledge records from knowledge.jsonl.

    Each non-blank line of the file is expected to be one JSON object.
    Returns an empty list when the file is missing or malformed so the
    chat keeps working without custom knowledge instead of crashing.
    """
    try:
        with open("knowledge.jsonl", "r", encoding="utf-8") as f:
            # Skip blank lines so a trailing newline doesn't break parsing.
            return [json.loads(line) for line in f if line.strip()]
    except (OSError, json.JSONDecodeError):
        # Narrow catch: missing or corrupt file degrades gracefully to
        # "no custom knowledge" rather than swallowing unrelated errors.
        return []


def coretex_chat(user_input):
    """Answer *user_input* using the local Llama engine plus custom knowledge.

    Builds a Llama 3 chat-template prompt whose system section embeds the
    records from knowledge.jsonl, then runs a single completion.
    """
    knowledge = load_knowledge()

    # Format the custom knowledge into the system prompt. .get() keeps a
    # partially malformed record from raising KeyError mid-request.
    context_str = "\n".join(
        f"Info: {k.get('context', '')} -> {k.get('response', '')}" for k in knowledge
    )

    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"You are Coretex. Use this custom knowledge:\n{context_str}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # THE ENGINE THINKS HERE
    output = llm(prompt, max_tokens=150, stop=["<|eot_id|>"], echo=False)
    return output["choices"][0]["text"]


demo = gr.Interface(
    fn=coretex_chat,
    inputs="text",
    outputs="text",
    title="Coretex Private Engine",
)
demo.queue().launch()