fix '8 Mile is Turkish': jellyfin guesses never earn high confidence

Two bugs compounded:

1. extractOriginalLanguage() in jellyfin.ts picked the FIRST audio stream's language and called it 'original'. Files sourced from non-English regions often have a local dub as track 0, so 8 Mile with a Turkish dub first got labelled Turkish.
2. scan.ts promoted any single-source answer to confidence='high' — even the pure Jellyfin guess, as long as no second source (Radarr/Sonarr) contradicted it. Jellyfin's dub-magnet guess should never be green.

Fixes:

- extractOriginalLanguage now prefers the IsDefault audio track and skips tracks whose title shouts 'dub' / 'commentary' / 'director'. Still a heuristic, but much less wrong. Falls back to the first track when every candidate looks like a dub, so we have *something* to flag.
- scan.ts: high confidence requires an authoritative source (Radarr/Sonarr) with no conflict. A Jellyfin-only answer is always low confidence AND gets needs_review=1 so it surfaces in the pipeline for manual override.
- Data migration (idempotent): downgrade existing plans backed only by the Jellyfin heuristic to low confidence and mark needs_review=1, so users don't have to rescan to benefit.
- New server/services/__tests__/jellyfin.test.ts covers the default-track preference and dub-skip behavior.
2026-04-13 11:39:59 +02:00
parent e3b241bef3
commit 50d3e50280
4 changed files with 101 additions and 14 deletions
--- a/server/api/scan.ts
+++ b/server/api/scan.ts
@@ -218,9 +218,16 @@ async function runScan(limit: number | null = null): Promise<void> {
 			const tmdbId = providerIds.Tmdb ?? null;
 			const tvdbId = providerIds.Tvdb ?? null;

-			let origLang: string | null = extractOriginalLanguage(jellyfinItem);
-			let origLangSource = "jellyfin";
+			// Jellyfin has no real original_language field; extractOriginalLanguage
+			// guesses from the first/default audio stream. That's a DUB MAGNET —
+			// files uploaded from non-English regions often have a local dub first,
+			// so the "original" comes out as Turkish, German, etc. We record it
+			// as a starting point but treat it as unverified.
+			const jellyfinGuess = extractOriginalLanguage(jellyfinItem);
+			let origLang: string | null = jellyfinGuess;
+			let origLangSource = jellyfinGuess ? "jellyfin" : null;
 			let needsReview = origLang ? 0 : 1;
+			let authoritative = false; // set when Radarr/Sonarr answers

 			if (jellyfinItem.Type === "Movie" && radarrEnabled && (tmdbId || imdbId)) {
 				const lang = await radarrLang(
@@ -231,6 +238,7 @@ async function runScan(limit: number | null = null): Promise<void> {
 					if (origLang && normalizeLanguage(origLang) !== normalizeLanguage(lang)) needsReview = 1;
 					origLang = lang;
 					origLangSource = "radarr";
+					authoritative = true;
 				}
 			}

@@ -240,17 +248,21 @@ async function runScan(limit: number | null = null): Promise<void> {
 					if (origLang && normalizeLanguage(origLang) !== normalizeLanguage(lang)) needsReview = 1;
 					origLang = lang;
 					origLangSource = "sonarr";
+					authoritative = true;
 				}
 			}

-			// Compute confidence from source agreement
+			// High confidence requires an authoritative source (Radarr/Sonarr) and
+			// no conflict. A Jellyfin-only guess is ALWAYS low confidence and gets
+			// flagged for review — that's how 8 Mile landed as "Turkish": default
+			// audio was a Turkish dub, Radarr wasn't available or didn't have the
+			// movie, and the guess got a green 'high' badge it never earned.
 			let confidence: "high" | "low" = "low";
-			if (!origLang) {
-				confidence = "low"; // unknown language
-			} else if (needsReview) {
-				confidence = "low"; // sources disagree
-			} else {
-				confidence = "high"; // language known, no conflicts
+			if (origLang && authoritative && !needsReview) {
+				confidence = "high";
+			} else if (origLang && !authoritative) {
+				// Jellyfin guess only — surface it for manual review.
+				needsReview = 1;
 			}

 			upsertItem.run(