diff --git a/server/api/scan.ts b/server/api/scan.ts index 46d63cc..76cf188 100644 --- a/server/api/scan.ts +++ b/server/api/scan.ts @@ -218,9 +218,16 @@ async function runScan(limit: number | null = null): Promise { const tmdbId = providerIds.Tmdb ?? null; const tvdbId = providerIds.Tvdb ?? null; - let origLang: string | null = extractOriginalLanguage(jellyfinItem); - let origLangSource = "jellyfin"; + // Jellyfin has no real original_language field; extractOriginalLanguage + // guesses from the first/default audio stream. That's a DUB MAGNET — + // files uploaded from non-English regions often have a local dub first, + // so the "original" comes out as Turkish, German, etc. We record it + // as a starting point but treat it as unverified. + const jellyfinGuess = extractOriginalLanguage(jellyfinItem); + let origLang: string | null = jellyfinGuess; + let origLangSource = jellyfinGuess ? "jellyfin" : null; let needsReview = origLang ? 0 : 1; + let authoritative = false; // set when Radarr/Sonarr answers if (jellyfinItem.Type === "Movie" && radarrEnabled && (tmdbId || imdbId)) { const lang = await radarrLang( @@ -231,6 +238,7 @@ async function runScan(limit: number | null = null): Promise { if (origLang && normalizeLanguage(origLang) !== normalizeLanguage(lang)) needsReview = 1; origLang = lang; origLangSource = "radarr"; + authoritative = true; } } @@ -240,17 +248,21 @@ async function runScan(limit: number | null = null): Promise { if (origLang && normalizeLanguage(origLang) !== normalizeLanguage(lang)) needsReview = 1; origLang = lang; origLangSource = "sonarr"; + authoritative = true; } } - // Compute confidence from source agreement + // High confidence requires an authoritative source (Radarr/Sonarr) and + // no conflict. 
A Jellyfin-only guess is ALWAYS low confidence and gets + // flagged for review — that's how 8 Mile landed as "Turkish": the first + // audio track was a Turkish dub, Radarr wasn't available or didn't have the + // movie, and the guess got a green 'high' badge it never earned. let confidence: "high" | "low" = "low"; - if (!origLang) { - confidence = "low"; // unknown language - } else if (needsReview) { - confidence = "low"; // sources disagree - } else { - confidence = "high"; // language known, no conflicts + if (origLang && authoritative && !needsReview) { + confidence = "high"; + } else if (origLang && !authoritative) { + // Jellyfin guess only — surface it for manual review. + needsReview = 1; } upsertItem.run( diff --git a/server/db/index.ts b/server/db/index.ts index 861e620..8d9ec5c 100644 --- a/server/db/index.ts +++ b/server/db/index.ts @@ -88,6 +88,22 @@ export function getDb(): Database { } catch { /* already exists */ } + + // Data migration (idempotent): any plan whose original_language came from + // the Jellyfin heuristic is downgraded to low confidence and flagged for + // review. Previous scans marked these 'high' when no other source + // disagreed — but Jellyfin's guess isn't authoritative, so it shouldn't + // have been green in the first place. Only pending/error plans have their confidence changed, so + // already-processed work isn't clobbered; the item's needs_review flag is set regardless of plan status.
+ _db.exec(` + UPDATE media_items SET needs_review = 1 + WHERE orig_lang_source = 'jellyfin' AND original_language IS NOT NULL AND needs_review = 0; + UPDATE review_plans SET confidence = 'low' + WHERE confidence = 'high' + AND status IN ('pending', 'error') + AND item_id IN (SELECT id FROM media_items WHERE orig_lang_source = 'jellyfin'); + `); + seedDefaults(_db); return _db; diff --git a/server/services/__tests__/jellyfin.test.ts b/server/services/__tests__/jellyfin.test.ts new file mode 100644 index 0000000..ec68783 --- /dev/null +++ b/server/services/__tests__/jellyfin.test.ts @@ -0,0 +1,46 @@ +import { describe, expect, test } from "bun:test"; +import type { JellyfinItem, JellyfinMediaStream } from "../../types"; +import { extractOriginalLanguage } from "../jellyfin"; + +function audio(o: Partial<JellyfinMediaStream>): JellyfinMediaStream { + return { Type: "Audio", Index: 0, ...o }; +} + +function item(streams: JellyfinMediaStream[]): JellyfinItem { + return { Id: "x", Type: "Movie", Name: "Test", MediaStreams: streams }; +} + +describe("extractOriginalLanguage — Jellyfin heuristic", () => { + test("returns null when there are no audio streams", () => { + expect(extractOriginalLanguage(item([{ Type: "Video", Index: 0 }]))).toBe(null); + }); + + test("uses the only audio track when there is just one", () => { + expect(extractOriginalLanguage(item([audio({ Language: "eng" })]))).toBe("eng"); + }); + + test("prefers the IsDefault audio track over position", () => { + // 8 Mile regression: Turkish dub first, English default further down. + // Old heuristic took the first track and labelled the movie Turkish.
+ const streams = [audio({ Index: 0, Language: "tur" }), audio({ Index: 1, Language: "eng", IsDefault: true })]; + expect(extractOriginalLanguage(item(streams))).toBe("eng"); + }); + + test("skips a dub even when it is the default", () => { + const streams = [ + audio({ Index: 0, Language: "tur", IsDefault: true, Title: "Turkish Dub" }), + audio({ Index: 1, Language: "eng" }), + ]; + expect(extractOriginalLanguage(item(streams))).toBe("eng"); + }); + + test("falls back to first audio track when every track looks like a dub", () => { + const streams = [ + audio({ Index: 0, Language: "tur", Title: "Turkish Dub" }), + audio({ Index: 1, Language: "deu", Title: "German Dub" }), + ]; + // No good candidate — returns the first audio so there's *some* guess, + // but scan.ts is responsible for marking this needs_review. + expect(extractOriginalLanguage(item(streams))).toBe("tur"); + }); +}); diff --git a/server/services/jellyfin.ts b/server/services/jellyfin.ts index fe01afd..c217b53 100644 --- a/server/services/jellyfin.ts +++ b/server/services/jellyfin.ts @@ -167,13 +167,26 @@ export async function refreshItem(cfg: JellyfinConfig, jellyfinId: string, timeo // Timeout reached — proceed anyway (refresh may still complete in background) } -/** Map a Jellyfin item to our normalized language code (ISO 639-2). */ +/** Case-insensitive hints that a track is a dub / commentary, not the original. */ +const DUB_TITLE_HINTS = /(dub|dubb|synchro|commentary|director)/i; + +/** + * Jellyfin has no real original_language field, so we guess from audio streams. + * This is the notorious "8 Mile got labelled Turkish" heuristic — guard it: + * 1. Prefer IsDefault audio when available (Jellyfin sets this from the file's + * default disposition flag; uploaders usually set it to the original). + * 2. Skip tracks whose title screams "dub" / "commentary". + * 3. Fall back to the first non-dub audio track, then first audio track. 
+ * The caller must still treat any jellyfin-sourced value as unverified — this + * just makes the guess less wrong. The trustworthy answer comes from Radarr/Sonarr. + */ export function extractOriginalLanguage(item: JellyfinItem): string | null { - // Jellyfin doesn't have a direct "original_language" field like TMDb. - // The best proxy is the language of the first audio stream. if (!item.MediaStreams) return null; - const firstAudio = item.MediaStreams.find((s) => s.Type === "Audio"); - return firstAudio?.Language ? normalizeLanguage(firstAudio.Language) : null; + const audio = item.MediaStreams.filter((s) => s.Type === "Audio"); + if (audio.length === 0) return null; + const notDub = (s: JellyfinMediaStream) => !s.Title || !DUB_TITLE_HINTS.test(s.Title); + const pick = audio.find((s) => s.IsDefault && notDub(s)) ?? audio.find(notDub) ?? audio[0]; + return pick.Language ? normalizeLanguage(pick.Language) : null; } /** Map a Jellyfin MediaStream to our internal MediaStream shape (sans id/item_id). */