| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
"""
Evaluation script for the n8n Expert model.

Metrics:
1. JSON Validity - Is the output valid JSON?
2. Schema Compliance - Does the workflow follow the n8n schema?
3. Node Accuracy - Are the node types correct?
4. Connection Logic - Are the connections consistent?
5. Thinking Quality - Is the reasoning present and structured?

Usage:
    python eval_n8n_model.py --model stmasson/n8n-expert-14b --samples 100
"""
|
|
| import os |
| import json |
| import argparse |
| import re |
| from typing import Dict, List, Any, Tuple |
| from dataclasses import dataclass |
| from tqdm import tqdm |
| import pandas as pd |
| import torch |
| from datasets import load_dataset |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
| from huggingface_hub import login |
|
|
| |
| |
| |
|
|
| |
# Node type identifiers accepted without further checks, kept in the same
# four groups as the original literal.

# Trigger nodes.
_TRIGGER_NODE_TYPES = (
    "n8n-nodes-base.webhookTrigger",
    "n8n-nodes-base.scheduleTrigger",
    "n8n-nodes-base.manualTrigger",
    "n8n-nodes-base.emailTrigger",
)

# Generic request / data-shaping / flow-control nodes.
_CORE_NODE_TYPES = (
    "n8n-nodes-base.httpRequest",
    "n8n-nodes-base.set",
    "n8n-nodes-base.if",
    "n8n-nodes-base.switch",
    "n8n-nodes-base.merge",
    "n8n-nodes-base.splitInBatches",
    "n8n-nodes-base.function",
    "n8n-nodes-base.code",
    "n8n-nodes-base.noOp",
)

# Third-party service and database nodes.
_SERVICE_NODE_TYPES = (
    "n8n-nodes-base.slack",
    "n8n-nodes-base.gmail",
    "n8n-nodes-base.googleSheets",
    "n8n-nodes-base.airtable",
    "n8n-nodes-base.notion",
    "n8n-nodes-base.discord",
    "n8n-nodes-base.telegram",
    "n8n-nodes-base.openAi",
    "n8n-nodes-base.postgres",
    "n8n-nodes-base.mysql",
    "n8n-nodes-base.mongodb",
)

# Nodes from the "@n8n/n8n-nodes-langchain" package.
_LANGCHAIN_NODE_TYPES = (
    "@n8n/n8n-nodes-langchain.agent",
    "@n8n/n8n-nodes-langchain.chainLlm",
)

VALID_NODE_TYPES = {
    *_TRIGGER_NODE_TYPES,
    *_CORE_NODE_TYPES,
    *_SERVICE_NODE_TYPES,
    *_LANGCHAIN_NODE_TYPES,
}
|
|
| |
| |
| |
|
|
@dataclass
class EvalResult:
    """Outcome of evaluating one generated example.

    Stores the task label, one boolean per check, and an optional error
    message. ``score`` averages the six boolean checks.
    """

    task_type: str
    valid_json: bool
    has_nodes: bool
    has_connections: bool
    nodes_valid: bool
    has_thinking: bool
    thinking_structured: bool
    error: str = ""

    @property
    def score(self) -> float:
        """Return the overall score in [0, 1]: the mean of the six checks."""
        checked = (
            "valid_json",
            "has_nodes",
            "has_connections",
            "nodes_valid",
            "has_thinking",
            "thinking_structured",
        )
        passed = sum(getattr(self, name) for name in checked)
        return passed / len(checked)
|
|
|
|
_THINKING_RE = re.compile(r'<thinking>(.*?)</thinking>', re.DOTALL)
_FENCED_JSON_RE = re.compile(r'```json\s*(.*?)\s*```', re.DOTALL)
_BRACE_SPAN_RE = re.compile(r'\{[\s\S]*\}')


def _leading_json_object(text: str) -> str:
    """Return the JSON value that starts at the first ``{`` of *text*.

    Returns "" when *text* has no ``{`` or when the value starting there
    does not decode as JSON.
    """
    start = text.find("{")
    if start == -1:
        return ""
    try:
        # raw_decode stops at the end of the first complete value, so any
        # prose (even prose containing braces) after the object is ignored.
        _, end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return ""
    return text[start:end]


def extract_workflow_json(text: str) -> Tuple[str, str]:
    """Split a model response into its reasoning and its workflow JSON.

    The reasoning is the stripped content of the first
    ``<thinking>...</thinking>`` pair ("" if absent). The workflow JSON is
    looked up in this order, the first hit winning:

    1. a fenced ``json`` code block located after the thinking section;
    2. a JSON object that decodes cleanly starting at the first ``{`` after
       the thinking section (trailing text is dropped);
    3. a fenced ``json`` code block anywhere in the response (this covers a
       block that only appears inside the thinking section);
    4. the span from the first ``{`` to the last ``}`` after the thinking
       section, returned as-is even if it is not valid JSON.

    Args:
        text: Raw text generated by the model.

    Returns:
        A ``(thinking, workflow_json)`` tuple; either element may be "".
    """
    thinking = ""
    answer = text
    thinking_match = _THINKING_RE.search(text)
    if thinking_match:
        thinking = thinking_match.group(1).strip()
        answer = text[thinking_match.end():]

    # Prefer what follows the reasoning: a draft snippet quoted inside
    # <thinking> must not shadow the final answer.
    fenced = _FENCED_JSON_RE.search(answer)
    if fenced:
        return thinking, fenced.group(1).strip()

    bare = _leading_json_object(answer)
    if bare:
        return thinking, bare

    fenced = _FENCED_JSON_RE.search(text)
    if fenced:
        return thinking, fenced.group(1).strip()

    # Last resort: hand back the widest brace-delimited span so the caller
    # can still report the JSON decoding error for it.
    span = _BRACE_SPAN_RE.search(answer)
    return thinking, (span.group(0).strip() if span else "")
|
|
|
|
def validate_workflow(workflow_json: str) -> Dict[str, Any]:
    """Check a workflow JSON string against basic n8n structure rules.

    Malformed structure never raises: a top-level value that is not an
    object, or ``nodes`` / ``connections`` of the wrong type, is reported
    through the returned flags instead.

    Args:
        workflow_json: Text expected to contain a JSON-encoded workflow.

    Returns:
        A dict with the keys ``valid_json``, ``has_nodes``,
        ``has_connections``, ``nodes_valid``, ``node_count``,
        ``connection_count`` and ``invalid_nodes``. An ``error`` key is
        added when the text is not valid JSON or does not decode to an
        object.
    """
    result: Dict[str, Any] = {
        "valid_json": False,
        "has_nodes": False,
        "has_connections": False,
        "nodes_valid": False,
        "node_count": 0,
        "connection_count": 0,
        "invalid_nodes": [],
    }

    try:
        wf = json.loads(workflow_json)
    except json.JSONDecodeError as e:
        result["error"] = str(e)
        return result
    result["valid_json"] = True

    # Valid JSON that is not an object (list, string, number, ...) has no
    # "nodes" / "connections" to inspect.
    if not isinstance(wf, dict):
        result["error"] = (
            f"workflow JSON is {type(wf).__name__}, expected object"
        )
        return result

    nodes = wf.get("nodes", [])
    if not isinstance(nodes, list):
        nodes = []
    result["has_nodes"] = len(nodes) > 0
    result["node_count"] = len(nodes)

    connections = wf.get("connections", {})
    if not isinstance(connections, dict):
        connections = {}
    result["has_connections"] = len(connections) > 0
    # Counts the entries under each source node; values that are not
    # containers contribute nothing.
    result["connection_count"] = sum(
        len(targets)
        for targets in connections.values()
        if isinstance(targets, (dict, list))
    )

    accepted_prefixes = ("n8n-nodes-base.", "@n8n/")
    invalid_nodes = []
    for node in nodes:
        if not isinstance(node, dict):
            invalid_nodes.append(f"<{type(node).__name__}>")
            continue
        node_type = node.get("type", "")
        if not node_type:
            # Nodes without a type are not flagged.
            continue
        if not isinstance(node_type, str):
            invalid_nodes.append(repr(node_type))
            continue
        # Types outside the known set are still accepted when they carry
        # one of the recognised package prefixes.
        if node_type in VALID_NODE_TYPES or node_type.startswith(accepted_prefixes):
            continue
        invalid_nodes.append(node_type)

    result["invalid_nodes"] = invalid_nodes
    result["nodes_valid"] = len(invalid_nodes) == 0

    return result
|
|
|
|
def validate_thinking(thinking: str) -> Dict[str, bool]:
    """Assess whether the reasoning text is present and looks structured.

    Args:
        thinking: Reasoning text extracted from the model response.

    Returns:
        A dict with ``has_thinking`` (True when the text is longer than 50
        characters) and ``thinking_structured`` (True when a non-empty text
        contains a list marker or a step keyword).
    """
    # (pattern, flags) pairs that signal list-like formatting. The first
    # pattern is deliberately loose: any digits followed by a dot match,
    # which includes decimals such as "3.14".
    list_markers = (
        (r'\d+\.', 0),
        (r'^-\s', re.MULTILINE),
        (r'^\*\s', re.MULTILINE),
    )

    structured = False
    if thinking:
        lowered = thinking.lower()
        structured = (
            any(
                re.search(pattern, thinking, flags) is not None
                for pattern, flags in list_markers
            )
            or "étape" in lowered
            or "step" in lowered
        )

    return {
        "has_thinking": len(thinking) > 50,
        "thinking_structured": structured,
    }
|
|
|
|
def evaluate_example(
    model_output: str,
    task_type: str,
) -> EvalResult:
    """Score a single model completion.

    Splits the completion into reasoning and workflow JSON, validates each
    part, and packs the resulting flags into an ``EvalResult``.

    Args:
        model_output: Raw text generated by the model.
        task_type: Task label to attach to the result.

    Returns:
        The ``EvalResult`` for this example; ``error`` carries the workflow
        validation error message when one was reported.
    """
    thinking_text, workflow_text = extract_workflow_json(model_output)

    workflow_checks = validate_workflow(workflow_text)
    thinking_checks = validate_thinking(thinking_text)

    flags = {
        key: workflow_checks[key]
        for key in ("valid_json", "has_nodes", "has_connections", "nodes_valid")
    }
    flags["has_thinking"] = thinking_checks["has_thinking"]
    flags["thinking_structured"] = thinking_checks["thinking_structured"]

    return EvalResult(
        task_type=task_type,
        error=workflow_checks.get("error", ""),
        **flags,
    )
|
|
|
|
| |
| |
| |
|
|
# Ordered (task label, keywords) pairs; the first label whose keyword occurs
# in the lowercased system message wins, so the order is significant.
_TASK_KEYWORDS = (
    ("generate", ("génère", "generate")),
    ("edit", ("édite", "edit")),
    ("fix", ("corrige", "fix")),
    ("improve", ("améliore", "improve")),
    ("explain", ("explique", "explain")),
    ("debug", ("débogue", "debug")),
)

# Boolean EvalResult attributes reported as global rates.
_CHECK_NAMES = (
    "valid_json",
    "has_nodes",
    "has_connections",
    "nodes_valid",
    "has_thinking",
    "thinking_structured",
)


def _detect_task_type(system_msg: str) -> str:
    """Map a system message to a task label by keyword, else "unknown"."""
    lowered = system_msg.lower()
    for label, keywords in _TASK_KEYWORDS:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "unknown"


def _rate(values) -> float:
    """Return the mean of *values*, or 0.0 when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else 0.0


def run_evaluation(
    model_path: str,
    dataset_repo: str = "stmasson/n8n-agentic-multitask",
    data_file: str = "data/multitask_large/val.jsonl",
    num_samples: int = 100,
    output_file: str = "eval_results.json",
):
    """Run the full evaluation of a model and write a JSON report.

    Loads the model and the validation split, generates one completion per
    example with greedy decoding, scores each completion with
    ``evaluate_example``, prints global and per-task metrics, and saves
    them to ``output_file``.

    Args:
        model_path: Model identifier or path passed to ``from_pretrained``.
        dataset_repo: Dataset repository passed to ``load_dataset``.
        data_file: Data file inside the repository used as validation split.
        num_samples: Maximum number of examples; when smaller than the
            split, a seeded shuffle (seed 42) selects the subset.
        output_file: Path of the JSON report to write.

    Returns:
        Dict mapping each metric name to its rate over all evaluated
        examples (every rate is 0.0 when no example was evaluated).
    """
    banner = "=" * 60
    print(banner)
    print("ÉVALUATION DU MODÈLE N8N EXPERT")
    print(banner)

    # Authenticate only when a token is provided through the environment.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    print(f"\nChargement du modèle: {model_path}")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )

    print(f"\nChargement du dataset: {dataset_repo}")
    dataset = load_dataset(
        dataset_repo,
        data_files={"validation": data_file},
        split="validation"
    )

    if num_samples < len(dataset):
        dataset = dataset.shuffle(seed=42).select(range(num_samples))

    print(f"Évaluation sur {len(dataset)} exemples")

    results = []
    # Results grouped by task; dict insertion order keeps tasks in order of
    # first appearance for the report.
    results_by_task = {}

    for example in tqdm(dataset, desc="Évaluation"):
        messages = example["messages"]

        # The task label is inferred from the first message of the
        # conversation.
        system_msg = messages[0]["content"] if messages else ""
        task_type = _detect_task_type(system_msg)

        # Everything except the last message forms the prompt.
        prompt = tokenizer.apply_chat_template(
            messages[:-1],
            tokenize=False,
            add_generation_prompt=True,
        )

        try:
            generation = pipe(
                prompt,
                max_new_tokens=4096,
                do_sample=False,
                temperature=None,
                top_p=None,
                return_full_text=False,
            )
            generated = generation[0]["generated_text"]
        except Exception as e:
            # Best effort: a failed generation is scored as an ordinary
            # (failing) output instead of aborting the whole run.
            generated = f"ERROR: {str(e)}"

        eval_result = evaluate_example(generated, task_type)
        results.append(eval_result)
        results_by_task.setdefault(task_type, []).append(eval_result)

    print("\n" + banner)
    print("RÉSULTATS")
    print(banner)

    # _rate() yields 0.0 on an empty run instead of dividing by zero.
    metrics = {
        name: _rate(getattr(r, name) for r in results)
        for name in _CHECK_NAMES
    }
    metrics["overall_score"] = _rate(r.score for r in results)

    print("\nMétriques globales:")
    for metric, value in metrics.items():
        print(f" {metric}: {value:.1%}")

    print("\nMétriques par tâche:")
    for task_type in sorted(results_by_task):
        task_results = results_by_task[task_type]
        task_score = _rate(r.score for r in task_results)
        task_json = _rate(r.valid_json for r in task_results)
        print(f" {task_type}: score={task_score:.1%}, json={task_json:.1%} (n={len(task_results)})")

    report = {
        "model": model_path,
        "num_samples": len(results),
        "metrics": metrics,
        "by_task": {
            task: {
                "count": len(task_results),
                "score": _rate(r.score for r in task_results),
            }
            for task, task_results in results_by_task.items()
        },
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    print(f"\nRésultats sauvegardés dans: {output_file}")

    return metrics
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the evaluation.
    parser = argparse.ArgumentParser(description="Évaluation du modèle n8n Expert")
    parser.add_argument("--model", type=str, required=True, help="Chemin du modèle à évaluer")
    parser.add_argument("--samples", type=int, default=100, help="Nombre d'exemples à évaluer")
    parser.add_argument("--output", type=str, default="eval_results.json", help="Fichier de sortie")
    # Optional overrides; when omitted, run_evaluation's own defaults apply.
    parser.add_argument("--dataset", type=str, default=None, help="Dépôt du dataset à utiliser")
    parser.add_argument("--data-file", type=str, default=None, help="Fichier de données dans le dépôt")

    args = parser.parse_args()

    # Forward only the overrides that were given, so the defaults stay
    # defined in a single place (the run_evaluation signature).
    overrides = {}
    if args.dataset is not None:
        overrides["dataset_repo"] = args.dataset
    if args.data_file is not None:
        overrides["data_file"] = args.data_file

    run_evaluation(
        model_path=args.model,
        num_samples=args.samples,
        output_file=args.output,
        **overrides,
    )
|
|