// Request defaults merged into every completion call;
// per-call `params` override any of these.
const paramDefaults = {
  stream: true,
  temperature: 0.2,
};

// Lazily populated cache of the server's generation settings: filled in
// from the final streamed chunk (see llama()) or fetched by llamaModelInfo().
let generation_settings = null;
| |
|
/**
 * Error thrown when the completion endpoint responds with a failure.
 *
 * @param {string} message - human-readable error message (becomes `.message`).
 * @param {string} name - error name, e.g. 'ServerError' (becomes `.name`).
 * @param {*} [data] - optional structured payload describing the failure.
 */
export class CompletionError extends Error {
  constructor(message, name, data) {
    super(message);
    this.name = name;
    // Bug fix: `data` was accepted but silently discarded; keep it so
    // callers can inspect the server's structured error payload.
    this.data = data;
  }
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Stream a completion from a llama.cpp server.
 *
 * POSTs `prompt` (merged with `paramDefaults` and `params`) to
 * `config.endpoint` (default `/completion`) and yields one event object per
 * server-sent-event line. Each yielded object carries the parsed payload on
 * `.data`. The full generated text is also accumulated and returned when the
 * stream ends.
 *
 * @param {string} prompt - prompt text sent to the server.
 * @param {object} [params] - completion parameters; `params.api_key`, if set,
 *   is sent as a Bearer token.
 * @param {object} [config] - `api_url`, `endpoint`, and optional `controller`
 *   (AbortController) to cancel the request.
 * @returns {AsyncGenerator<object, string>} yields event objects; returns the
 *   accumulated content string.
 * @throws {CompletionError} on a non-200 response.
 */
export async function* llama(prompt, params = {}, config = {}) {
  let controller = config.controller;
  const api_url = config.api_url?.replace(/\/+$/, '') || "";

  if (!controller) {
    controller = new AbortController();
  }

  const completionParams = { ...paramDefaults, ...params, prompt };

  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
    method: 'POST',
    body: JSON.stringify(completionParams),
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
      'Accept': 'text/event-stream',
      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });

  if (response.status !== 200) {
    // Bug fix: the original threw inside its own try block (so the
    // CompletionError was caught and rewrapped) and, when the JSON body had
    // no error.message, fell through and tried to read an already-consumed
    // body. Always throw here; prefer the server's message when available.
    let message = `server returned HTTP status ${response.status}`;
    try {
      const body = await response.json();
      if (body?.error?.message) {
        message = body.error.message;
      }
    } catch (_) {
      // Body was not valid JSON; keep the generic status message.
    }
    throw new CompletionError(message, 'ServerError');
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder();

  let content = ""; // full generated text, returned at the end
  let leftover = ""; // partial line carried over between reads

  try {
    let cont = true;

    while (cont) {
      const result = await reader.read();
      if (result.done) {
        break;
      }

      // Bug fix: `{ stream: true }` keeps multi-byte UTF-8 sequences that
      // straddle chunk boundaries intact across decode() calls.
      const text = leftover + decoder.decode(result.value, { stream: true });

      // A chunk may end mid-line; keep the incomplete tail for the next read.
      const endsWithLineBreak = text.endsWith('\n');
      const lines = text.split('\n');
      if (!endsWithLineBreak) {
        leftover = lines.pop();
      } else {
        leftover = "";
      }

      for (const line of lines) {
        // Bug fix: the original used a single /g regex across exec() calls;
        // a global regex carries `lastIndex` state between calls and
        // silently skips matches on subsequent lines. Use a fresh,
        // non-global match per line.
        const match = /^(\S+):\s(.*)$/.exec(line);
        if (!match) {
          continue;
        }
        // Attach the SSE field (e.g. `data`, `error`) to the event object
        // that will be yielded to the consumer.
        result[match[1]] = match[2];

        // OpenAI-compatible servers signal end-of-stream with "[DONE]".
        if (result.data === '[DONE]') {
          cont = false;
          break;
        }

        if (result.data) {
          result.data = JSON.parse(result.data);
          content += result.data.content;

          // Yield one event per parsed data line.
          yield result;

          // If we got a stop token from the server, note the settings and
          // finish.
          if (result.data.stop) {
            if (result.data.generation_settings) {
              generation_settings = result.data.generation_settings;
            }
            cont = false;
            break;
          }
        }

        if (result.error) {
          let parsedError = null;
          try {
            parsedError = JSON.parse(result.error);
          } catch (_) {
            console.error(`llama.cpp error ${result.error}`);
          }
          if (parsedError) {
            result.error = parsedError;
            if (parsedError.message?.includes('slot unavailable')) {
              // Bug fix: this throw was previously caught by its own local
              // catch and swallowed; it must propagate so upstream callers
              // can retry on another slot.
              throw new Error('slot unavailable');
            }
            console.error(`llama.cpp error [${parsedError.code} - ${parsedError.type}]: ${parsedError.message}`);
          }
        }
      }
    }
  } catch (e) {
    if (e.name !== 'AbortError') {
      console.error("llama error: ", e);
    }
    throw e;
  } finally {
    // Release the connection whether we finished, errored, or the consumer
    // stopped iterating.
    controller.abort();
  }

  return content;
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Run a completion and surface the stream as DOM-style events.
 *
 * Dispatches:
 *  - "message" with `detail` = parsed chunk data, per streamed chunk;
 *  - "generation_settings" / "timings" when those fields appear in a chunk;
 *  - "done" with `detail.content` = full generated text;
 *  - "error" with `detail` = the thrown error (new; the original left the
 *    async IIFE's rejection unhandled).
 *
 * @param {string} prompt
 * @param {object} [params]
 * @param {object} [config]
 * @returns {EventTarget}
 */
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
  const eventTarget = new EventTarget();
  (async () => {
    let content = "";
    try {
      for await (const chunk of llama(prompt, params, config)) {
        // Bug fix: the original dereferenced chunk.data.generation_settings
        // and chunk.data.timings outside this guard, throwing a TypeError on
        // any chunk without a data payload.
        if (chunk.data) {
          content += chunk.data.content;
          eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
          if (chunk.data.generation_settings) {
            eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
          }
          if (chunk.data.timings) {
            eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
          }
        }
      }
      eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
    } catch (err) {
      // Bug fix: without this, any stream failure became an unhandled
      // promise rejection with no way for listeners to observe it.
      eventTarget.dispatchEvent(new CustomEvent("error", { detail: err }));
    }
  })();
  return eventTarget;
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Run a completion and resolve with the full generated text.
 *
 * Bug fix: the original wrapped an async executor in `new Promise(...)`
 * (the explicit-construction anti-pattern); a plain async function returns
 * an identical promise, resolving with the content and rejecting with any
 * error thrown by the stream.
 *
 * @param {string} prompt
 * @param {object} [params]
 * @param {object} [config]
 * @returns {Promise<string>} the concatenated completion text.
 */
export const llamaPromise = async (prompt, params = {}, config = {}) => {
  let content = "";
  for await (const chunk of llama(prompt, params, config)) {
    content += chunk.data.content;
  }
  return content;
};
| |
|
| | |
| | |
| | |
/**
 * Drive a completion and hand every streamed chunk to `callback`.
 *
 * @param {object} params - completion parameters; `params.prompt` is the prompt.
 * @param {AbortController} controller - used to cancel the request.
 * @param {(chunk: object) => void} callback - invoked once per streamed chunk.
 */
export const llamaComplete = async (params, controller, callback) => {
  const stream = llama(params.prompt, params, { controller });
  for await (const part of stream) {
    callback(part);
  }
};
| |
|
| | |
/**
 * Return the server's default generation settings, fetching `/props` once
 * and caching the result in the module-level `generation_settings`.
 *
 * Bug fix: the original mixed `await` with `.then()` in a single expression;
 * use sequential awaits instead.
 *
 * @param {object} [config] - optional `api_url` (trailing slashes stripped).
 * @returns {Promise<object>} the cached or freshly-fetched settings.
 */
export const llamaModelInfo = async (config = {}) => {
  if (!generation_settings) {
    const api_url = config.api_url?.replace(/\/+$/, '') || "";
    const response = await fetch(`${api_url}/props`);
    const props = await response.json();
    generation_settings = props.default_generation_settings;
  }
  return generation_settings;
};
| |
|