// Main worker: DiT + encoders + VAE on WebGPU. Spawns a dedicated LM worker
// (isolated WASM heap) for autoregressive generation.
import { AutoTokenizer } from "@huggingface/transformers";
import * as ort from "onnxruntime-web/webgpu";

const MODEL_REPO = "shreyask/ACE-Step-v1.5-ONNX";
const MODEL_REVISION = "bdabfb5684fd70fcc76f98cbb51bb9ebc47ee342";
const ONNX_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/${MODEL_REVISION}/onnx`;
const TEXT_TOKENIZER_REPO = "Qwen/Qwen3-Embedding-0.6B";

const SAMPLE_RATE = 48000;   // VAE output sample rate (Hz)
const LATENT_RATE = 25;      // latent frames per second of audio
const LATENT_CHANNELS = 64;  // channels per latent frame
const HIDDEN_SIZE = 2048;    // conditioning hidden dimension
const POOL_WINDOW = 5;       // 25Hz -> 5Hz pooling window (not referenced in this file)
const FSQ_DIM = 6;           // FSQ code dimensionality
const NUM_CODES = 64000;     // FSQ codebook size (not referenced in this file)

// 8-step turbo schedules (from ACE-Step), keyed by shift factor.
const SHIFT_TIMESTEPS_8 = {
  1.0: [1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125],
  2.0: [1.0, 0.9333, 0.8571, 0.7692, 0.6667, 0.5455, 0.4, 0.2222],
  3.0: [1.0, 0.9545, 0.9, 0.8333, 0.75, 0.6429, 0.5, 0.3],
};

/**
 * Build an N-step shifted flow-matching schedule matching the MLX port:
 *   timesteps = linspace(1.0, 0.001, N)
 *   sigma     = shift * t / (1 + (shift - 1) * t)
 * The canonical 8-step schedules use the precomputed ACE-Step tables above.
 * @param {number} numSteps - number of denoising steps (>= 1)
 * @param {number} shift - schedule shift factor
 * @returns {number[]} descending sigma schedule of length numSteps
 */
function buildSchedule(numSteps, shift) {
  if (numSteps === 8 && SHIFT_TIMESTEPS_8[shift]) return SHIFT_TIMESTEPS_8[shift];
  const sigmaMax = 1.0;
  const sigmaMin = 0.001;
  // Guard numSteps === 1: the linspace denominator would be 0 (0/0 -> NaN).
  // With the guard, a single step uses t = sigmaMax.
  const denom = Math.max(1, numSteps - 1);
  const schedule = [];
  for (let i = 0; i < numSteps; i++) {
    // linspace inclusive of both endpoints
    const t = sigmaMax + (sigmaMin - sigmaMax) * (i / denom);
    const tShifted = (shift * t) / (1.0 + (shift - 1.0) * t);
    schedule.push(tShifted);
  }
  return schedule;
}

const CACHE_NAME = "ace-step-onnx-v12";

// Worker-global state, populated by loadModels().
let textTokenizer = null;
let sessions = {};
let silenceLatent = null;
let fsqCodebooks = null;
let fsqScales = null;
let fsqProjectOutW = null;
let fsqProjectOutB = null;
let lmWorker = null;
let lmLoaded = false;

/**
 * Post a typed message to the main thread.
 * @param {string} type - message type tag
 * @param {object} [data] - payload merged into the message
 * @param {Transferable[]} [transfer] - optional transfer list (e.g. large buffers)
 */
function post(type, data = {}, transfer) {
  // FIX: the original ignored a third argument, so callers that passed a
  // transfer list (the WAV buffer) silently structured-cloned it instead.
  self.postMessage({ type, ...data }, transfer);
}

/**
 * Fetch a binary resource with Cache API caching and progress reporting.
 * @param {string} url - absolute URL to fetch
 * @param {string} label - human-readable label used in progress messages
 * @returns {Promise<ArrayBuffer>} the resource bytes
 * @throws {Error} on HTTP error status
 */
async function fetchBuffer(url, label) {
  const cache = await caches.open(CACHE_NAME);
  const cached = await cache.match(url);
  if (cached) {
    post("progress", { label, loaded: 1, total: 1, percent: 100 });
    return await cached.arrayBuffer();
  }
  const response = await fetch(url);
  // FIX: the original cached whatever came back, even an HTTP error page.
  if (!response.ok) throw new Error(`HTTP ${response.status} fetching ${url}`);
  const total = parseInt(response.headers.get("content-length") || "0");
  const reader = response.body.getReader();
  const chunks = [];
  let loaded = 0;
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    loaded += value.length;
    if (total > 0) post("progress", { label, loaded, total, percent: (loaded / total) * 100 });
  }
  const buffer = new Uint8Array(loaded);
  let offset = 0;
  for (const chunk of chunks) {
    buffer.set(chunk, offset);
    offset += chunk.length;
  }
  try {
    // Best-effort cache write; quota/security failures are non-fatal.
    await cache.put(url, new Response(buffer.buffer.slice(0), {
      headers: { "Content-Type": "application/octet-stream" },
    }));
  } catch (_) {}
  return buffer.buffer;
}

/**
 * Create an ONNX Runtime session for a model stored under ONNX_BASE.
 * External weights are either streamed by URL (useUrlData=true, for very
 * large files) or fetched/cached here and passed as a buffer.
 * @param {string} name - display name for status/progress messages
 * @param {string} filename - model graph filename (weights at `${filename}.data`)
 * @param {boolean} [useUrlData=false] - let ORT fetch external data by URL
 * @param {string[]} [providers] - execution providers, default ["webgpu"]
 * @returns {Promise<ort.InferenceSession>}
 * @throws {Error} wrapping the underlying failure
 */
async function loadSession(name, filename, useUrlData = false, providers = ["webgpu"]) {
  post("status", { message: `Loading ${name}...` });
  try {
    // FIX: the original interpolated a broken "$(unknown)" placeholder here
    // instead of the `filename` parameter, so every model URL was invalid.
    const modelBuffer = await fetchBuffer(`${ONNX_BASE}/${filename}`, `${name} graph`);
    if (useUrlData) {
      return await ort.InferenceSession.create(modelBuffer, {
        executionProviders: providers,
        externalData: [{ path: `${filename}.data`, data: `${ONNX_BASE}/${filename}.data` }],
      });
    }
    const weightsBuffer = await fetchBuffer(`${ONNX_BASE}/${filename}.data`, `${name} weights`);
    return await ort.InferenceSession.create(modelBuffer, {
      executionProviders: providers,
      externalData: [{ path: `${filename}.data`, data: weightsBuffer }],
    });
  } catch (err) {
    throw new Error(`Failed loading ${name}: ${err.message}`, { cause: err });
  }
}

/** Shorthand for constructing an ort.Tensor. */
function tensor(data, dims, type = "float32") {
  return new ort.Tensor(type, data, dims);
}

/**
 * Log min/max/mean statistics of a numeric array for debugging.
 * @param {string} name - label printed with the stats
 * @param {Float32Array|ArrayLike<number>} data
 */
function tensorStats(name, data) {
  const arr = data instanceof Float32Array ? data : new Float32Array(data);
  if (arr.length === 0) {
    console.log(`[stats] ${name}: len=0`);
    return;
  }
  let min = Infinity, max = -Infinity, sum = 0;
  for (let i = 0; i < arr.length; i++) {
    if (arr[i] < min) min = arr[i];
    if (arr[i] > max) max = arr[i];
    sum += arr[i];
  }
  console.log(`[stats] ${name}: len=${arr.length} min=${min.toFixed(4)} max=${max.toFixed(4)} mean=${(sum / arr.length).toFixed(4)}`);
}

/**
 * Sample standard-normal noise via the Box-Muller transform.
 * @param {number[]} shape - tensor shape; output is the flat product
 * @returns {Float32Array} N(0,1) samples
 */
function randn(shape) {
  const size = shape.reduce((a, b) => a * b, 1);
  const data = new Float32Array(size);
  for (let i = 0; i < size; i += 2) {
    // FIX: use 1 - Math.random() so u1 is in (0, 1]; Math.random() can return
    // exactly 0, and log(0) = -Infinity would produce NaN samples.
    const u1 = 1 - Math.random();
    const u2 = Math.random();
    const r = Math.sqrt(-2 * Math.log(u1));
    data[i] = r * Math.cos(2 * Math.PI * u2);
    if (i + 1 < size) data[i + 1] = r * Math.sin(2 * Math.PI * u2);
  }
  return data;
}

/**
 * Concatenate two padded sequences per batch item, stably moving all
 * masked-in (mask > 0) positions to the front of the packed sequence.
 * Shapes: hidden1 is [B, l1, dim] flat, hidden2 is [B, l2, dim] flat.
 * (Not referenced elsewhere in this file — packing now happens inside the
 * condition_encoder ONNX graph; kept for parity with the original port.)
 * @returns {{hidden: Float32Array, mask: Float32Array, seqLen: number}}
 */
function packSequences(hidden1, mask1, hidden2, mask2, batchSize, dim) {
  const l1 = hidden1.length / (batchSize * dim);
  const l2 = hidden2.length / (batchSize * dim);
  const totalLen = l1 + l2;
  const packedHidden = new Float32Array(batchSize * totalLen * dim);
  const packedMask = new Float32Array(batchSize * totalLen);
  for (let b = 0; b < batchSize; b++) {
    const indices = [];
    for (let i = 0; i < l1; i++) indices.push({ src: 1, idx: i, mask: mask1[b * l1 + i] });
    for (let i = 0; i < l2; i++) indices.push({ src: 2, idx: i, mask: mask2[b * l2 + i] });
    // Stable sort: masked-in entries first, original order otherwise preserved.
    indices.sort((a, c) => c.mask - a.mask);
    for (let pos = 0; pos < totalLen; pos++) {
      const entry = indices[pos];
      const srcArray = entry.src === 1 ? hidden1 : hidden2;
      const srcLen = entry.src === 1 ? l1 : l2;
      const srcOffset = (b * srcLen + entry.idx) * dim;
      const dstOffset = (b * totalLen + pos) * dim;
      packedHidden.set(srcArray.slice(srcOffset, srcOffset + dim), dstOffset);
      packedMask[b * totalLen + pos] = entry.mask > 0 ? 1 : 0;
    }
  }
  return { hidden: packedHidden, mask: packedMask, seqLen: totalLen };
}

/**
 * Map FSQ code indices to HIDDEN_SIZE vectors: codebook lookup, per-dim
 * scaling, then the project_out linear layer (weight [HIDDEN_SIZE, FSQ_DIM]).
 * @param {ArrayLike<number>} indices - [batchSize * seqLen] code indices
 * @returns {Float32Array} [batchSize, seqLen, HIDDEN_SIZE] flat
 */
function fsqLookup(indices, batchSize, seqLen) {
  const out = new Float32Array(batchSize * seqLen * HIDDEN_SIZE);
  for (let b = 0; b < batchSize; b++) {
    for (let t = 0; t < seqLen; t++) {
      const idx = indices[b * seqLen + t];
      const codeOffset = idx * FSQ_DIM;
      const scaledCode = new Float32Array(FSQ_DIM);
      for (let d = 0; d < FSQ_DIM; d++) scaledCode[d] = fsqCodebooks[codeOffset + d] * fsqScales[d];
      const outOffset = (b * seqLen + t) * HIDDEN_SIZE;
      for (let h = 0; h < HIDDEN_SIZE; h++) {
        let val = fsqProjectOutB[h];
        for (let d = 0; d < FSQ_DIM; d++) val += scaledCode[d] * fsqProjectOutW[h * FSQ_DIM + d];
        out[outOffset + h] = val;
      }
    }
  }
  return out;
}

// Spawn the LM worker and forward its status/progress messages up to the main thread
function spawnLMWorker() {
  const worker = new Worker(new URL("./lm-worker.js", import.meta.url), { type: "module" });
  worker.onmessage = (e) => {
    const { type, ...data } = e.data;
    if (type === "status" || type === "progress" || type === "error") {
      self.postMessage(e.data); // forward as-is
    }
    // "loaded" and "audio_codes" are handled by the promise-based callers below
  };
  return worker;
}

/**
 * Ensure the LM worker exists and ask it to load its model.
 * Resolves on "loaded", rejects on "error".
 * @returns {Promise<void>}
 */
function loadLMWorker() {
  return new Promise((resolve, reject) => {
    if (!lmWorker) lmWorker = spawnLMWorker();
    const onMsg = (e) => {
      if (e.data.type === "loaded") {
        lmWorker.removeEventListener("message", onMsg);
        lmLoaded = true;
        resolve();
      } else if (e.data.type === "error") {
        lmWorker.removeEventListener("message", onMsg);
        reject(new Error(e.data.message));
      }
    };
    lmWorker.addEventListener("message", onMsg);
    lmWorker.postMessage({ type: "load" });
  });
}

/**
 * Ask the LM worker to generate 5Hz audio codes for the given prompt.
 * Resolves with the "audio_codes" payload ({ codes, elapsed, tokenCount }).
 * @returns {Promise<object>}
 */
function generateAudioCodesViaLM({ caption, lyrics, duration, numLatentFrames }) {
  return new Promise((resolve, reject) => {
    const onMsg = (e) => {
      if (e.data.type === "audio_codes") {
        lmWorker.removeEventListener("message", onMsg);
        resolve(e.data);
      } else if (e.data.type === "error") {
        lmWorker.removeEventListener("message", onMsg);
        reject(new Error(e.data.message));
      }
    };
    lmWorker.addEventListener("message", onMsg);
    lmWorker.postMessage({ type: "generate", caption, lyrics, duration, numLatentFrames });
  });
}

/**
 * Load the tokenizer, all ONNX sessions, and auxiliary binary data.
 * Kicks off the LM worker load in parallel and awaits it at the end.
 * Posts "loaded" when everything is ready.
 */
async function loadModels() {
  ort.env.wasm.numThreads = 1;
  ort.env.wasm.simd = true;
  ort.env.wasm.proxy = false;
  console.log(`[models] ONNX revision ${MODEL_REVISION}`);
  post("status", { message: `Using ONNX revision ${MODEL_REVISION.slice(0, 7)}` });

  post("status", { message: "Spawning LM worker..." });
  // Kick off LM loading in parallel with main-worker model loads
  const lmLoadPromise = loadLMWorker();

  post("status", { message: "Loading text tokenizer..." });
  textTokenizer = await AutoTokenizer.from_pretrained(TEXT_TOKENIZER_REPO);

  sessions.embedTokens = await loadSession("Embed Tokens", "text_embed_tokens_fp16.onnx");
  sessions.detokenizer = await loadSession("Detokenizer", "detokenizer.onnx");
  // VAE on WASM — WebGPU produces constant output past ~1.5s for conv1d upsample chain
  sessions.vaeDecoder = await loadSession("VAE Decoder (CPU)", "vae_decoder_fp16.onnx", false, ["wasm"]);
  sessions.textEncoder = await loadSession("Text Encoder", "text_encoder_fp16.onnx", true);
  // FP32 condition_encoder — q4v2 had max_diff=13.92 vs PyTorch with real inputs,
  // degrading conditioning so badly that DiT output was garbled. FP32 is 2.4GB via URL.
  sessions.conditionEncoder = await loadSession("Condition Encoder (fp32)", "condition_encoder.onnx", true);
  // DEBUG: dit_decoder_fp16_v2 is the quality baseline (max_diff=0.021 per step).
  // dit_cached trades quality for speed (max_diff=0.074). Reverting while we diagnose
  // the ONNX-vs-MLX spectral gap — compounded drift over 8 steps matters here.
  sessions.ditDecoder = await loadSession("DiT Decoder (uncached)", "dit_decoder_fp16_v2.onnx", true);

  post("status", { message: "Loading auxiliary data..." });
  const [cbBuf, scBuf, powBuf, pobBuf, silBuf] = await Promise.all([
    fetchBuffer(`${ONNX_BASE}/fsq_codebooks.bin`, "codebooks"),
    fetchBuffer(`${ONNX_BASE}/fsq_scales.bin`, "scales"),
    fetchBuffer(`${ONNX_BASE}/fsq_project_out_weight.bin`, "proj_out_w"),
    fetchBuffer(`${ONNX_BASE}/fsq_project_out_bias.bin`, "proj_out_b"),
    fetchBuffer("/silence_latent.bin", "silence latent"),
  ]);
  fsqCodebooks = new Float32Array(cbBuf);
  fsqScales = new Float32Array(scBuf);
  fsqProjectOutW = new Float32Array(powBuf);
  fsqProjectOutB = new Float32Array(pobBuf);
  silenceLatent = new Float32Array(silBuf);

  post("status", { message: "Waiting for LM worker..." });
  await lmLoadPromise;
  post("status", { message: "All models loaded!" });
  post("loaded");
}

/** Build the SFT-style instruction prompt fed to the text encoder. */
function buildSFTPrompt(caption, metas) {
  const instruction = "Fill the audio semantic mask based on the given conditions:";
  return `# Instruction\n${instruction}\n\n# Caption\n${caption}\n\n# Metas\n${metas}<|endoftext|>`;
}

/**
 * Tokenize + encode the caption prompt and project it.
 * NOTE(review): uses sessions.textProjector, which loadModels() never
 * assigns — this appears to be a legacy path superseded by the
 * condition_encoder flow in generateAudio(); verify before calling.
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 */
async function encodeText(caption, metas) {
  const prompt = buildSFTPrompt(caption, metas);
  const encoded = textTokenizer(prompt, { padding: "max_length", max_length: 256, truncation: true });
  const idsRaw = encoded.input_ids.data;
  const inputIds = idsRaw instanceof BigInt64Array ? idsRaw : new BigInt64Array(Array.from(idsRaw, BigInt));
  const result = await sessions.textEncoder.run({ input_ids: tensor(inputIds, [1, 256], "int64") });
  const projected = await sessions.textProjector.run({ text_hidden_states: result.hidden_states });
  const maskRaw = encoded.attention_mask.data;
  const attentionMask = new Float32Array(maskRaw.length);
  for (let i = 0; i < maskRaw.length; i++) attentionMask[i] = Number(maskRaw[i]);
  return { hidden: projected.projected.data, mask: attentionMask, seqLen: 256 };
}

/**
 * Tokenize + embed + encode lyrics.
 * NOTE(review): uses sessions.lyricEncoder, which loadModels() never
 * assigns — appears to be a legacy path; verify before calling.
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 */
async function encodeLyrics(lyrics, language = "en") {
  const fullText = `# Languages\n${language}\n\n# Lyric\n${lyrics}`;
  // max_length=2048 matches the original handler (conditioning_text.py)
  const encoded = textTokenizer(fullText, { padding: "max_length", max_length: 2048, truncation: true });
  const idsRaw = encoded.input_ids.data;
  const inputIds = idsRaw instanceof BigInt64Array ? idsRaw : new BigInt64Array(Array.from(idsRaw, BigInt));
  const seqLen = inputIds.length;
  const embedResult = await sessions.embedTokens.run({ input_ids: tensor(inputIds, [1, seqLen], "int64") });
  const maskRaw = encoded.attention_mask.data;
  const attentionMask = new Float32Array(maskRaw.length);
  for (let i = 0; i < maskRaw.length; i++) attentionMask[i] = Number(maskRaw[i]);
  const lyricResult = await sessions.lyricEncoder.run({
    inputs_embeds: embedResult.hidden_states,
    attention_mask: tensor(attentionMask, [1, seqLen]),
  });
  return { hidden: lyricResult.hidden_states.data, mask: attentionMask, seqLen };
}

/**
 * Encode a silence reference as the timbre embedding.
 * NOTE(review): uses sessions.timbreEncoder, which loadModels() never
 * assigns — appears to be a legacy path; verify before calling.
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 */
async function encodeTimbre() {
  const silenceRef = silenceLatent.slice(0, 750 * LATENT_CHANNELS);
  const result = await sessions.timbreEncoder.run({
    refer_audio: tensor(silenceRef, [1, 750, LATENT_CHANNELS]),
  });
  const timbreHidden = new Float32Array(HIDDEN_SIZE);
  timbreHidden.set(result.timbre_embedding.data);
  return { hidden: timbreHidden, mask: new Float32Array([1.0]), seqLen: 1 };
}

/**
 * Run the LM worker to get 5Hz codes, project through the FSQ codebook,
 * and detokenize to 25Hz hints, padded/truncated to numLatentFrames.
 * @returns {Promise<Float32Array>} [numLatentFrames * LATENT_CHANNELS] hints
 */
async function generateLMHints(caption, lyrics, numLatentFrames, duration) {
  const { codes, elapsed, tokenCount } = await generateAudioCodesViaLM({ caption, lyrics, duration, numLatentFrames });
  post("status", { message: `LM: ${codes.length} codes from ${tokenCount} tokens in ${elapsed}s` });
  if (codes.length === 0) {
    console.warn("[lm] No audio codes generated, returning silence");
    return new Float32Array(numLatentFrames * LATENT_CHANNELS);
  }
  const numCodes5Hz = codes.length;

  post("status", { message: "FSQ codebook lookup..." });
  const lmHints5Hz = fsqLookup(codes, 1, numCodes5Hz);
  tensorStats("lm_hints_5hz", lmHints5Hz);

  post("status", { message: "Detokenizing 5Hz → 25Hz..." });
  const detokResult = await sessions.detokenizer.run({
    quantized: tensor(lmHints5Hz, [1, numCodes5Hz, HIDDEN_SIZE]),
  });
  const lmHints25HzRaw = detokResult.lm_hints_25hz.data;
  const rawLen = lmHints25HzRaw.length / LATENT_CHANNELS;
  tensorStats("lm_hints_25hz_raw", lmHints25HzRaw);

  // Pad with last frame (MLX port behavior) or truncate
  const lmHints25Hz = new Float32Array(numLatentFrames * LATENT_CHANNELS);
  if (rawLen >= numLatentFrames) {
    lmHints25Hz.set(lmHints25HzRaw.slice(0, numLatentFrames * LATENT_CHANNELS));
  } else {
    lmHints25Hz.set(lmHints25HzRaw);
    // Repeat last frame to fill remaining
    const lastFrameStart = (rawLen - 1) * LATENT_CHANNELS;
    const lastFrame = lmHints25HzRaw.slice(lastFrameStart, lastFrameStart + LATENT_CHANNELS);
    for (let t = rawLen; t < numLatentFrames; t++) {
      lmHints25Hz.set(lastFrame, t * LATENT_CHANNELS);
    }
    console.log(`[hints] padded ${rawLen} → ${numLatentFrames} frames with last-frame replication`);
  }
  tensorStats("lm_hints_25hz_final", lmHints25Hz);
  return lmHints25Hz;
}

/**
 * Full generation pipeline: encode text/lyrics, get LM hints, run the
 * condition encoder, denoise with the DiT over the turbo schedule, decode
 * with the VAE, master, and post the WAV back to the main thread.
 * @param {object} params
 * @param {string} params.caption - style/description prompt
 * @param {string} params.lyrics - lyric text
 * @param {number} params.duration - target duration in seconds
 * @param {number} params.shift - schedule shift factor
 * @param {number} [params.numSteps=8] - denoising steps
 */
async function generateAudio({ caption, lyrics, duration, shift, numSteps = 8 }) {
  const totalStartTime = performance.now();
  const filenameStamp = Date.now();
  const batchSize = 1;
  const numLatentFrames = Math.round(duration * LATENT_RATE);
  const tSchedule = buildSchedule(numSteps, shift);
  const metas = `duration: ${duration}s`;

  // 1. Text → Qwen3 embedding (1024-dim hidden states, BEFORE projection)
  post("status", { message: "Encoding text..." });
  const sftPrompt = buildSFTPrompt(caption, metas);
  const textEnc = textTokenizer(sftPrompt, { padding: "max_length", max_length: 256, truncation: true });
  const textIdsRaw = textEnc.input_ids.data;
  const textIds = textIdsRaw instanceof BigInt64Array ? textIdsRaw : new BigInt64Array(Array.from(textIdsRaw, BigInt));
  const textHiddenRes = await sessions.textEncoder.run({ input_ids: tensor(textIds, [1, 256], "int64") });
  const textHidden = textHiddenRes.hidden_states;
  const textMaskRaw = textEnc.attention_mask.data;
  const textMask = new Float32Array(textMaskRaw.length);
  for (let i = 0; i < textMaskRaw.length; i++) textMask[i] = Number(textMaskRaw[i]);

  // 2. Lyric tokens → embed_tokens (1024-dim, passed into condition_encoder's lyric_encoder)
  post("status", { message: "Embedding lyrics..." });
  const lyricFullText = `# Languages\nen\n\n# Lyric\n${lyrics}`;
  const lyricEnc = textTokenizer(lyricFullText, { padding: "max_length", max_length: 2048, truncation: true });
  const lyricIdsRaw = lyricEnc.input_ids.data;
  const lyricIds = lyricIdsRaw instanceof BigInt64Array ? lyricIdsRaw : new BigInt64Array(Array.from(lyricIdsRaw, BigInt));
  const lyricEmbRes = await sessions.embedTokens.run({ input_ids: tensor(lyricIds, [1, 2048], "int64") });
  const lyricEmb = lyricEmbRes.hidden_states;
  const lyricMaskRaw = lyricEnc.attention_mask.data;
  const lyricMask = new Float32Array(lyricMaskRaw.length);
  for (let i = 0; i < lyricMaskRaw.length; i++) lyricMask[i] = Number(lyricMaskRaw[i]);

  // 3. LM hints (mandatory for turbo model)
  const lmHints25Hz = await generateLMHints(caption, lyrics, numLatentFrames, duration);

  // 4. Silence for ref audio (timbre) and src_latents
  const silenceRef = silenceLatent.slice(0, 750 * LATENT_CHANNELS);
  const srcLatents = new Float32Array(numLatentFrames * LATENT_CHANNELS);
  const chunkMasks = new Float32Array(numLatentFrames * LATENT_CHANNELS).fill(1.0);
  const isCovers = new Float32Array([1.0]); // force use of LM hints

  // 5. condition_encoder: does text_projector + lyric_encoder + timbre_encoder + pack_sequences + context_latents
  post("status", { message: "Running condition encoder..." });
  const condResult = await sessions.conditionEncoder.run({
    text_hidden_states: textHidden,
    text_attention_mask: tensor(textMask, [1, 256]),
    lyric_hidden_states: lyricEmb,
    lyric_attention_mask: tensor(lyricMask, [1, 2048]),
    refer_audio_acoustic_hidden_states_packed: tensor(silenceRef, [1, 750, LATENT_CHANNELS]),
    refer_audio_order_mask: tensor(new BigInt64Array([0n]), [1], "int64"),
    src_latents: tensor(srcLatents, [1, numLatentFrames, LATENT_CHANNELS]),
    chunk_masks: tensor(chunkMasks, [1, numLatentFrames, LATENT_CHANNELS]),
    is_covers: tensor(isCovers, [1]),
    precomputed_lm_hints_25hz: tensor(lmHints25Hz, [1, numLatentFrames, LATENT_CHANNELS]),
  });
  const encoderHiddenStates = condResult.encoder_hidden_states;
  const contextLatentsTensor = condResult.context_latents;
  tensorStats("encoder_hidden_states", encoderHiddenStates.data);
  tensorStats("context_latents", contextLatentsTensor.data);

  // 6. Flow-matching Euler integration over the sigma schedule.
  post("status", { message: "Starting denoising..." });
  let xt = randn([batchSize, numLatentFrames, LATENT_CHANNELS]);
  const startTime = performance.now();
  for (let step = 0; step < tSchedule.length; step++) {
    const tCurr = tSchedule[step];
    post("status", { message: `Denoising step ${step + 1}/${tSchedule.length}...` });
    const timestepData = new Float32Array(batchSize).fill(tCurr);
    const result = await sessions.ditDecoder.run({
      hidden_states: tensor(xt, [batchSize, numLatentFrames, LATENT_CHANNELS]),
      timestep: tensor(timestepData, [batchSize]),
      encoder_hidden_states: encoderHiddenStates,
      context_latents: contextLatentsTensor,
    });
    const vt = result.velocity.data;
    if (step === tSchedule.length - 1) {
      // Final step integrates all the way to sigma = 0.
      for (let i = 0; i < xt.length; i++) xt[i] = xt[i] - vt[i] * tCurr;
    } else {
      const dt = tCurr - tSchedule[step + 1];
      for (let i = 0; i < xt.length; i++) xt[i] = xt[i] - vt[i] * dt;
    }
  }
  const diffusionTime = ((performance.now() - startTime) / 1000).toFixed(2);
  tensorStats("final_latent", xt);

  // Per-frame variance check — detects if later frames are constant
  const perFrameVariance = new Float32Array(numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    let mean = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) mean += xt[t * LATENT_CHANNELS + c];
    mean /= LATENT_CHANNELS;
    let varSum = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      const d = xt[t * LATENT_CHANNELS + c] - mean;
      varSum += d * d;
    }
    perFrameVariance[t] = varSum / LATENT_CHANNELS;
  }
  console.log("[perframe] variance samples:", Array.from(perFrameVariance.filter((_, i) => i % 25 === 0)).map(v => v.toFixed(3)));

  // Also check LM hints per-frame variance
  const hintsVar = new Float32Array(numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    let mean = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) mean += lmHints25Hz[t * LATENT_CHANNELS + c];
    mean /= LATENT_CHANNELS;
    let varSum = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      const d = lmHints25Hz[t * LATENT_CHANNELS + c] - mean;
      varSum += d * d;
    }
    hintsVar[t] = varSum / LATENT_CHANNELS;
  }
  console.log("[hints var] samples:", Array.from(hintsVar.filter((_, i) => i % 25 === 0)).map(v => v.toFixed(3)));

  // 7. VAE decode — transpose [T, C] → [C, T] for the decoder input layout.
  post("status", { message: "Decoding audio..." });
  const latentsForVae = new Float32Array(batchSize * LATENT_CHANNELS * numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      latentsForVae[c * numLatentFrames + t] = xt[t * LATENT_CHANNELS + c];
    }
  }
  const vaeResult = await sessions.vaeDecoder.run({
    latents: tensor(latentsForVae, [batchSize, LATENT_CHANNELS, numLatentFrames]),
  });
  const waveform = vaeResult.waveform.data;
  tensorStats("waveform", waveform);

  masterWaveform(waveform, SAMPLE_RATE, 2);
  const wavBuffer = float32ToWav(waveform, SAMPLE_RATE, 2);

  // totalTime measures the whole pipeline (LM + encoders + diffusion + VAE),
  // not just the diffusion loop. diffusionTime is reported separately below.
  const totalTime = ((performance.now() - totalStartTime) / 1000).toFixed(2);
  // The transfer list is now honored by post() (see FIX there), so the WAV
  // buffer is moved, not copied.
  post("audio", { wavBuffer, duration, diffusionTime, totalTime, filenameStamp }, [wavBuffer]);
}

/**
 * Measure peak amplitude and RMS of an interleaved/planar sample buffer.
 * @returns {{peak: number, rms: number}}
 */
function measureAudio(samples) {
  let peak = 0;
  let sumSq = 0;
  for (let i = 0; i < samples.length; i++) {
    const v = samples[i];
    const abs = Math.abs(v);
    if (abs > peak) peak = abs;
    sumSq += v * v;
  }
  return { peak, rms: Math.sqrt(sumSq / Math.max(1, samples.length)) };
}

/**
 * Goertzel algorithm: power of a single frequency component in `data`.
 * @returns {number} squared magnitude at `freq`
 */
function goertzelPower(data, sampleRate, freq) {
  const omega = 2 * Math.PI * freq / sampleRate;
  const coeff = 2 * Math.cos(omega);
  let s0 = 0, s1 = 0, s2 = 0;
  for (let i = 0; i < data.length; i++) {
    s0 = data[i] + coeff * s1 - s2;
    s2 = s1;
    s1 = s0;
  }
  return s1 * s1 + s2 * s2 - coeff * s1 * s2;
}

/**
 * Scan 250–950 Hz for narrowband "drone" peaks that dominate the spectrum.
 * Downsamples to ~4 kHz mono, removes DC, and compares per-bin Goertzel
 * power against the median bin. Returns at most two peaks, each at least
 * 50 Hz apart, with power >= 12x the median.
 * Assumes planar channel layout (samples[ch * numSamples + i]).
 * @returns {{freq: number, score: number}[]}
 */
function detectDronePeaks(samples, sampleRate, channels) {
  const numSamples = samples.length / channels;
  const step = Math.max(1, Math.floor(sampleRate / 4000));
  const downsampleRate = sampleRate / step;
  const downsampledLength = Math.floor(numSamples / step);
  if (downsampledLength < 1024) return [];
  const mono = new Float32Array(downsampledLength);
  let mean = 0;
  for (let i = 0; i < downsampledLength; i++) {
    const src = i * step;
    let v = 0;
    for (let ch = 0; ch < channels; ch++) v += samples[ch * numSamples + src];
    v /= channels;
    mono[i] = v;
    mean += v;
  }
  mean /= downsampledLength;
  for (let i = 0; i < mono.length; i++) mono[i] -= mean;
  const bins = [];
  for (let freq = 250; freq <= 950; freq += 12.5) {
    bins.push({ freq, power: goertzelPower(mono, downsampleRate, freq) });
  }
  const sortedPowers = bins.map((bin) => bin.power).sort((a, b) => a - b);
  const median = sortedPowers[Math.floor(sortedPowers.length / 2)] + 1e-12;
  bins.sort((a, b) => b.power - a.power);
  const peaks = [];
  for (const bin of bins) {
    const score = bin.power / median;
    if (score < 12) break;
    if (peaks.every((peak) => Math.abs(peak.freq - bin.freq) >= 50)) {
      peaks.push({ freq: bin.freq, score });
      if (peaks.length >= 2) break;
    }
  }
  return peaks;
}

/**
 * Apply a partial biquad notch filter in place (per channel, planar layout).
 * `depth` blends dry and filtered signal so the notch is not total.
 * @param {Float32Array} samples - planar audio, modified in place
 * @param {number} freq - notch center frequency (Hz)
 * @param {number} [q=20] - filter Q
 * @param {number} [depth=0.45] - wet mix, 0 = bypass, 1 = full notch
 */
function applyNotch(samples, sampleRate, channels, freq, q = 20, depth = 0.45) {
  const numSamples = samples.length / channels;
  const w0 = 2 * Math.PI * freq / sampleRate;
  const cos = Math.cos(w0);
  const alpha = Math.sin(w0) / (2 * q);
  const a0 = 1 + alpha;
  const b0 = 1 / a0;
  const b1 = (-2 * cos) / a0;
  const b2 = 1 / a0;
  const a1 = (-2 * cos) / a0;
  const a2 = (1 - alpha) / a0;
  for (let ch = 0; ch < channels; ch++) {
    const offset = ch * numSamples;
    let x1 = 0, x2 = 0, y1 = 0, y2 = 0;
    for (let i = 0; i < numSamples; i++) {
      const x0 = samples[offset + i];
      const y0 = b0 * x0 + b1 * x1 + b2 * x2 - a1 * y1 - a2 * y2;
      samples[offset + i] = x0 * (1 - depth) + y0 * depth;
      x2 = x1;
      x1 = x0;
      y2 = y1;
      y1 = y0;
    }
  }
}

/**
 * In-place mastering: notch out detected drone peaks, then apply a single
 * gain targeting RMS 0.085, capped by peak 0.891 and a 12x max gain.
 * No-op for near-silent input (peak <= 0.001).
 */
function masterWaveform(samples, sampleRate, channels) {
  const before = measureAudio(samples);
  if (before.peak <= 0.001) return;
  const dronePeaks = detectDronePeaks(samples, sampleRate, channels);
  for (const peak of dronePeaks) applyNotch(samples, sampleRate, channels, peak.freq);
  const afterEq = measureAudio(samples);
  const targetRms = 0.085;
  const maxPeak = 0.891;
  const maxGain = 12.0;
  const gain = Math.min(
    maxGain,
    targetRms / Math.max(afterEq.rms, 1e-6),
    maxPeak / Math.max(afterEq.peak, 1e-6),
  );
  for (let i = 0; i < samples.length; i++) samples[i] *= gain;
  const after = measureAudio(samples);
  const peakText = dronePeaks.map((peak) => `${peak.freq.toFixed(1)}Hz/${peak.score.toFixed(0)}x`).join(", ") || "none";
  console.log(
    `[master] rawPeak=${before.peak.toFixed(4)} rawRms=${before.rms.toFixed(4)} ` +
    `dronePeaks=${peakText} gain=${gain.toFixed(2)}x peak=${after.peak.toFixed(4)} rms=${after.rms.toFixed(4)}`,
  );
}

/**
 * Encode planar float samples (samples[ch * numSamples + i]) as a 16-bit
 * PCM interleaved WAV file.
 * @returns {ArrayBuffer} complete RIFF/WAVE file bytes
 */
function float32ToWav(samples, sampleRate, channels = 2) {
  const numSamples = samples.length / channels;
  const bitsPerSample = 16;
  const blockAlign = channels * (bitsPerSample / 8);
  const byteRate = sampleRate * blockAlign;
  const dataSize = numSamples * blockAlign;
  const buffer = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buffer);
  const w = (o, s) => { for (let i = 0; i < s.length; i++) view.setUint8(o + i, s.charCodeAt(i)); };
  w(0, "RIFF");
  view.setUint32(4, 36 + dataSize, true);
  w(8, "WAVE");
  w(12, "fmt ");
  view.setUint32(16, 16, true);   // PCM fmt chunk size
  view.setUint16(20, 1, true);    // audio format: PCM
  view.setUint16(22, channels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  w(36, "data");
  view.setUint32(40, dataSize, true);
  let offset = 44;
  for (let i = 0; i < numSamples; i++) {
    for (let ch = 0; ch < channels; ch++) {
      const sample = Math.max(-1, Math.min(1, samples[ch * numSamples + i]));
      view.setInt16(offset, sample * 32767, true);
      offset += 2;
    }
  }
  return buffer;
}

// Message entry point: "load" prepares all models, "generate" runs the pipeline.
self.onmessage = async (e) => {
  const { type, ...data } = e.data;
  try {
    if (type === "load") await loadModels();
    else if (type === "generate") await generateAudio(data);
  } catch (err) {
    post("error", { message: err.message, stack: err.stack });
  }
};