import json

import gradio as gr
from llama_cpp import Llama

# 1. LOAD THE ENGINE
# We are using an 8-bit quantized (Q8_0) GGUF build of Llama 3.2 3B.
# This is the "Owner's Engine" - it runs locally on your Space.
print("Loading Coretex Engine...")
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-3b-instruct-q8_0.gguf",
    n_ctx=2048,   # Context window
    n_threads=2,  # Matches the 2 vCPUs on Hugging Face Free
)


def load_knowledge():
    """Load custom knowledge records from knowledge.jsonl.

    Each non-blank line of the file is expected to be one JSON object.
    Returns an empty list when the file is missing or malformed so the
    chat keeps working without custom knowledge instead of crashing.
    """
    try:
        with open("knowledge.jsonl", "r", encoding="utf-8") as f:
            # Skip blank lines so a trailing newline doesn't break parsing.
            return [json.loads(line) for line in f if line.strip()]
    except (OSError, json.JSONDecodeError):
        # Narrow catch: missing or corrupt file degrades gracefully to
        # "no custom knowledge" rather than swallowing unrelated errors.
        return []


def coretex_chat(user_input):
    """Answer *user_input* using the local Llama engine plus custom knowledge.

    Builds a Llama 3 chat-template prompt whose system section embeds the
    records from knowledge.jsonl, then runs a single completion.
    """
    knowledge = load_knowledge()

    # Format the custom knowledge into the system prompt. .get() keeps a
    # partially malformed record from raising KeyError mid-request.
    context_str = "\n".join(
        f"Info: {k.get('context', '')} -> {k.get('response', '')}" for k in knowledge
    )

    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"You are Coretex. Use this custom knowledge:\n{context_str}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # THE ENGINE THINKS HERE
    output = llm(prompt, max_tokens=150, stop=["<|eot_id|>"], echo=False)
    return output["choices"][0]["text"]


demo = gr.Interface(
    fn=coretex_chat,
    inputs="text",
    outputs="text",
    title="Coretex Private Engine",
)
demo.queue().launch()