|
|
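# Calibration set definition: global sampling settings, shared prompt templates
# (_templates), and the list of source datasets to sample from.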
calibration_set: |
|
|
_templates: |
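# Reusable prompt prefixes, referenced below via the *programming_languages and
# *spoken_languages aliases. The Jinja-style expression hashes the stringified row
# and indexes the list modulo its length, so each sample gets one language picked
# deterministically; the trailing "\n***\n" presumably separates this instruction
# from the sample text that follows.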
|
|
programming_languages: &programming_languages "Solve the following problem using {{ ['Zephyr', 'Prolog', 'Cobol', 'Apex', 'Crystal', 'Fortran', 'Nim', 'Delphi', 'Ada', 'Objective-C', 'VBA', 'Perl', 'Groovy', 'MATLAB', 'Solidity', 'Visual Basic', 'OCaml', 'Erlang', 'Julia', 'Lisp', 'F#', 'Clojure', 'GDScript', 'Scala', 'R', 'Haskell', 'Ruby', 'Elixir', 'Lua', 'Zig', 'Dart', 'Swift', 'Metal', 'PowerShell', 'PHP', 'Kotlin', 'C', 'Java', 'C++', 'C#', 'Bash/Shell', 'Go', 'Rust', 'TypeScript', 'HTML/CSS', 'SQL', 'JavaScript', 'Python', 'Lean', 'Coq', 'Pony', 'D', 'Racket', 'Haxe', 'x86-64 ASM', 'ARM-64 ASM', 'LLVM IR', 'GLSL', 'CUDA', 'Vulkan'][hash(row|string) % 60] }}\n***\n" |
|
|
spoken_languages: &spoken_languages "Answer in {{ ['Arabic', 'Chinese', 'French', 'German', 'Hebrew', 'Hindi', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Spanish', 'Turkish'][hash(row|string) % 12] }}\n***\n" |
|
|
max_seq_length: 8192 |
|
|
shuffle: true |
|
|
seed: 42 |
|
|
datasets: |
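# Each entry draws num_samples rows from the given dataset/subset/split, keeps the
# listed columns, and renders them with the named formatter: chat_completion for
# message lists, prompt_answer for instruction/response pairs, sharegpt for
# ShareGPT-style conversations, raw_text for a single text column. Where set,
# formatter_params.prefix is presumably prepended to each raw_text sample, and
# streaming: true streams the source instead of downloading it in full.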
|
|
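# General chat & instruction following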
- dataset: HuggingFaceH4/ultrachat_200k |
|
|
columns: [messages] |
|
|
split: train_sft |
|
|
formatter: chat_completion |
|
|
num_samples: 8 |
|
|
streaming: true |
|
|
|
|
|
- dataset: databricks/databricks-dolly-15k |
|
|
split: train |
|
|
columns: [instruction, response] |
|
|
formatter: prompt_answer |
|
|
num_samples: 8 |
|
|
|
|
|
- dataset: neuralmagic/calibration |
|
|
subset: LLM |
|
|
split: train |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 8 |
|
|
|
|
|
|
|
|
|
|
|
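# Helpfulness & reasoning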
- dataset: HuggingFaceH4/no_robots |
|
|
split: train |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 2 |
|
|
|
|
|
- dataset: nvidia/HelpSteer |
|
|
split: train |
|
|
columns: [prompt, response] |
|
|
formatter: prompt_answer |
|
|
num_samples: 2 |
|
|
streaming: true |
|
|
|
|
|
- dataset: garage-bAInd/Open-Platypus |
|
|
split: train |
|
|
columns: [instruction, output] |
|
|
formatter: prompt_answer |
|
|
num_samples: 2 |
|
|
|
|
|
- dataset: PJMixers/grimulkan_physical-reasoning-ShareGPT |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 4 |
|
|
|
|
|
- dataset: PJMixers/grimulkan_theory-of-mind-ShareGPT |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 4 |
|
|
|
|
|
|
|
|
|
|
|
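# Multilingual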
- dataset: HuggingFaceH4/Multilingual-Thinking |
|
|
split: train |
|
|
columns: [user] |
|
|
formatter: raw_text |
|
|
num_samples: 32 |
|
|
formatter_params: |
|
|
prefix: *spoken_languages |
|
|
|
|
|
- dataset: ServiceNow-AI/M2Lingual |
|
|
subset: full_data |
|
|
split: train |
|
|
columns: [conversation] |
|
|
formatter: chat_completion |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
|
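# Tool use / function calling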
- dataset: interstellarninja/hermes_reasoning_tool_use |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 100 |
|
|
streaming: true |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
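# Code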
- dataset: deepmind/code_contests |
|
|
split: train |
|
|
columns: [name] |
|
|
formatter: deepmind_code_contests |
|
|
num_samples: 50 |
|
|
streaming: true |
|
|
|
|
|
- dataset: dh02391735/stackoverflow-kubernetes-questions |
|
|
split: train |
|
|
columns: [instruction] |
|
|
formatter: raw_text |
|
|
num_samples: 8 |
|
|
streaming: true |
|
|
|
|
|
- dataset: diversoailab/humaneval-rust |
|
|
split: train |
|
|
columns: [prompt] |
|
|
formatter: raw_text |
|
|
num_samples: 100 |
|
|
formatter_params: |
|
|
prefix: *programming_languages |
|
|
|
|
|
- dataset: ammarnasr/the-stack-rust-clean |
|
|
split: train |
|
|
columns: [content] |
|
|
formatter: raw_text |
|
|
num_samples: 8 |
|
|
streaming: true |
|
|
formatter_params: |
|
|
prefix: "Explain this code and comment it for a junior dev.\n***\n" |
|
|
|
|
|
- dataset: CSJianYang/CodeArena |
|
|
split: test |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 8 |
|
|
|
|
|
- dataset: nvidia/OpenCodeInstruct |
|
|
split: train |
|
|
columns: [input, output] |
|
|
formatter: prompt_answer |
|
|
num_samples: 8 |
|
|
streaming: true |
|
|
|
|
|
- dataset: nvidia/Llama-Nemotron-Post-Training-Dataset |
|
|
split: code |
|
|
columns: [input] |
|
|
formatter: chat_completion |
|
|
num_samples: 8 |
|
|
streaming: true |
|
|
|
|
|
- dataset: nvidia/Nemotron-Competitive-Programming-v1 |
|
|
split: competitive_coding_cpp_part00 |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 8 |
|
|
streaming: true |
|
|
|
|
|
- dataset: rombodawg/code_bagel_hermes-2.5 |
|
|
split: train |
|
|
columns: [input, output] |
|
|
formatter: prompt_answer |
|
|
num_samples: 100 |
|
|
streaming: true |
|
|
|
|
|
- dataset: MathArena/project_euler |
|
|
split: train |
|
|
columns: [problem] |
|
|
formatter: raw_text |
|
|
num_samples: 30 |
|
|
formatter_params: |
|
|
prefix: *programming_languages |
|
|
|
|
|
|
|
|
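# Math & formal proofs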
- dataset: nvidia/Llama-Nemotron-Post-Training-Dataset |
|
|
split: math |
|
|
columns: [input] |
|
|
formatter: chat_completion |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
|
|
|
- dataset: nvidia/Nemotron-Math-Proofs-v1 |
|
|
split: lean |
|
|
columns: [formal_statement] |
|
|
formatter: raw_text |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
formatter_params: |
|
|
prefix: "Can you improve, document and add comment to this Lean proof for a non-mathematician?\n***\n" |
|
|
|
|
|
- dataset: nvidia/OpenMathInstruct-2 |
|
|
split: train |
|
|
columns: [problem, generated_solution] |
|
|
formatter: prompt_answer |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
|
|
|
|
|
|
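# Science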
- dataset: nvidia/Llama-Nemotron-Post-Training-Dataset |
|
|
split: science |
|
|
columns: [input] |
|
|
formatter: chat_completion |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
|
|
|
- dataset: nvidia/OpenScienceReasoning-2 |
|
|
split: train |
|
|
columns: [input, output] |
|
|
formatter: prompt_answer |
|
|
num_samples: 8 |
|
|
streaming: true |
|
|
|
|
|
- dataset: MegaScience/MegaScience |
|
|
split: train |
|
|
columns: [question, answer] |
|
|
formatter: prompt_answer |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
|
|
|
|
|
|
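# Medical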
- dataset: OpenMed/Medical-Reasoning-SFT-GPT-OSS-120B |
|
|
split: train |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
|
|
|
- dataset: ccdv/pubmed-summarization |
|
|
subset: section |
|
|
split: train |
|
|
columns: [article] |
|
|
formatter: raw_text |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
formatter_params: |
|
|
prefix: "Summarize this:\n***\n" |
|
|
|
|
|
|
|
|
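# Finance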
- dataset: gbharti/finance-alpaca |
|
|
split: train |
|
|
columns: [instruction, output] |
|
|
formatter: prompt_answer |
|
|
num_samples: 4 |
|
|
|
|
|
- dataset: vladlen32230/summarization-yahoo-stock-finance-article-text |
|
|
split: train |
|
|
columns: [text] |
|
|
formatter: raw_text |
|
|
num_samples: 4 |
|
|
formatter_params: |
|
|
prefix: "Summarize this:\n***\n" |
|
|
|
|
|
|
|
|
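# Persona prompts & business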
- dataset: fka/awesome-chatgpt-prompts |
|
|
split: train |
|
|
columns: [prompt] |
|
|
formatter: raw_text |
|
|
num_samples: 8 |
|
|
|
|
|
- dataset: theoldmandthesea/17k_business_book |
|
|
split: train |
|
|
columns: [question, answer] |
|
|
formatter: prompt_answer |
|
|
num_samples: 8 |
|
|
|
|
|
|
|
|
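# Philosophy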
- dataset: ruggsea/stanford-encyclopedia-of-philosophy_instruct |
|
|
split: train |
|
|
columns: [question, answer] |
|
|
formatter: prompt_answer |
|
|
num_samples: 2 |
|
|
streaming: true |
|
|
|
|
|
- dataset: mlfoundations-dev/stackexchange_philosophy |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 2 |
|
|
|
|
|
- dataset: FreedomIntelligence/SocraticChat |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 4 |
|
|
streaming: true |
|
|
|
|
|
|
|
|
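# Creative writing & roleplay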
- dataset: Gryphe/Opus-WritingPrompts |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 2 |
|
|
|
|
|
- dataset: anthracite-org/nopm_claude_writing_fixed |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 2 |
|
|
|
|
|
- dataset: zerofata/Roleplay-Anime-Characters |
|
|
split: train |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 1 |
|
|
|
|
|
- dataset: zerofata/Instruct-Anime |
|
|
split: train |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 1 |
|
|
|
|
|
- dataset: zerofata/Instruct-Anime-CreativeWriting |
|
|
split: train |
|
|
columns: [messages] |
|
|
formatter: chat_completion |
|
|
num_samples: 1 |
|
|
|
|
|
- dataset: sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo |
|
|
split: train |
|
|
columns: [chosen] |
|
|
formatter: chat_completion |
|
|
num_samples: 2 |
|
|
|
|
|
- dataset: PocketDoc/Dans-Prosemaxx-Adventure |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 2 |
|
|
|
|
|
- dataset: anthracite-org/stheno-filtered-v1.1 |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 2 |
|
|
streaming: true |
|
|
|
|
|
|
|
|
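# Tropes & pop culture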
- dataset: KaraKaraWitch/TvTroper-2025 |
|
|
split: train |
|
|
columns: [article] |
|
|
formatter: raw_text |
|
|
num_samples: 2 |
|
|
streaming: true |
|
|
formatter_params: |
|
|
prefix: "Explain this trope like I'm your grandmother\n***\n" |
|
|
|
|
|
|
|
|
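# Survival & field operations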
- dataset: AquaV/US-Army-Survival-Sharegpt |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 1 |
|
|
|
|
|
- dataset: AquaV/Interrogation-Sharegpt |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 1 |
|
|
|
|
|
- dataset: AquaV/Multi-Environment-Operations-Sharegpt |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 1 |
|
|
|
|
|
- dataset: AquaV/Resistance-Sharegpt |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 1 |
|
|
|
|
|
|
|
|
- dataset: PocketDoc/Dans-Kinomaxx-VanillaBackrooms |
|
|
split: train |
|
|
columns: [conversations] |
|
|
formatter: sharegpt |
|
|
num_samples: 1 |
|
|
|