import torch
from typing import Any, Dict, List

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Alpaca-style prompt template: the user instruction is interpolated below,
# and the model is expected to continue the text after "### Response:".
format_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:"
)
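
# For illustration only, with a hypothetical instruction such as
# format_input.format(instruction="Name three prime numbers."),
# the rendered prompt is:
#
#   Below is an instruction that describes a task. Write a response
#   that appropriately completes the request.
#
#   ### Instruction:
#   Name three prime numbers.
#
#   ### Response: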


class EndpointHandler:
    def __init__(self, path=""):
        # Load the tokenizer and the fp16 model from the repository path.
        # device_map="auto" lets accelerate place the weights, so no explicit
        # device argument may be passed to the pipeline below.
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            device_map="auto",
            torch_dtype=torch.float16,
        )

        self.pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_length=256,  # caps prompt + generated tokens at 256
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # Wrap the raw instruction in the training prompt template.
        text_input = format_input.format(instruction=inputs)

        # Forward any user-supplied generation parameters to the pipeline.
        if parameters is not None:
            prediction = self.pipeline(text_input, **parameters)
        else:
            prediction = self.pipeline(text_input)

        # The pipeline echoes the prompt, so keep only the text that follows
        # the last "### Response:" marker.
        output = [
            {"generated_text": pred["generated_text"].split("### Response:")[-1].strip()}
            for pred in prediction
        ]

        return output
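

# Minimal local smoke test, illustrative only: Inference Endpoints
# instantiates EndpointHandler itself, so this block is not required in
# production. MODEL_PATH is a hypothetical placeholder and must point at a
# causal-LM checkpoint (e.g. an Alpaca-style fine-tune) on disk or on the Hub.
if __name__ == "__main__":
    MODEL_PATH = "path/to/model"  # hypothetical; replace with a real checkpoint
    handler = EndpointHandler(path=MODEL_PATH)
    result = handler(
        {
            "inputs": "Explain what a tokenizer does.",
            "parameters": {"do_sample": True, "temperature": 0.7},
        }
    )
    print(result)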