# Coretex-API / app.py
# Hugging Face Space by itamar11 (commit b5870f4, "Update app.py").
import gradio as gr
from llama_cpp import Llama
import json
# 1. LOAD THE ENGINE
# We are using an 8-bit (Q8_0) quantized GGUF build of Llama 3.2 3B Instruct.
# This is the "Owner's Engine" - it runs locally on your Space.
print("Loading Coretex Engine...")
# from_pretrained downloads the GGUF weights from the Hub on first run,
# then loads them into a llama.cpp context for CPU inference.
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-3b-instruct-q8_0.gguf",
    n_ctx=2048, # Context window (tokens) — prompt + completion must fit here
    n_threads=2 # Matches the 2 vCPUs on Hugging Face Free
)
def load_knowledge():
    """Load custom knowledge entries from ``knowledge.jsonl``.

    Each non-blank line of the file is parsed as one JSON object
    (expected to carry ``context`` and ``response`` keys — see the
    prompt builder).

    Returns:
        list[dict]: One dict per JSON line, or an empty list when the
        file is missing, unreadable, or contains invalid JSON.
    """
    try:
        with open("knowledge.jsonl", "r", encoding="utf-8") as f:
            # Skip blank lines so a trailing newline doesn't blow up parsing.
            return [json.loads(line) for line in f if line.strip()]
    except (OSError, json.JSONDecodeError):
        # Narrowed from a bare `except:` — a missing or malformed file
        # simply means "no custom knowledge"; the chat still works with
        # the base model alone.
        return []
def coretex_chat(user_input):
    """Answer one user message with the local Llama engine.

    Reloads knowledge.jsonl on every call, folds each entry into the
    system prompt, and returns the model's completion text.
    """
    entries = load_knowledge()
    # Render each knowledge entry as an "Info: context -> response" line.
    fact_lines = []
    for entry in entries:
        fact_lines.append(f"Info: {entry['context']} -> {entry['response']}")
    context_str = "\n".join(fact_lines)
    # Llama 3 chat template: system turn carrying the knowledge, the user
    # turn, then an open assistant header for the model to complete.
    prompt = (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"You are Coretex. Use this custom knowledge:\n{context_str}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    # THE ENGINE THINKS HERE — generation stops at the end-of-turn token.
    completion = llm(prompt, max_tokens=150, stop=["<|eot_id|>"], echo=False)
    return completion['choices'][0]['text']
# Minimal Gradio UI: one text box in, the model's text answer out.
demo = gr.Interface(fn=coretex_chat, inputs="text", outputs="text", title="Coretex Private Engine")
# queue() serializes requests so concurrent users don't hit the single
# CPU-bound llama.cpp context at the same time; launch() starts the server.
demo.queue().launch()