From 020f3dacf1368940b13e026afae113a4daf63db6 Mon Sep 17 00:00:00 2001
From: xRangerDE <xRangerDE@24-music.de>
Date: Sun, 3 May 2026 15:54:40 +0200
Subject: [PATCH] harden: GQL retry on transient errors + consolidate shutdown
 cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two server-side changes touching different paths.

1. fetchPublicTwitchGql now retries on transient HTTP statuses
   (408/429/5xx) and on failures that produce no response at all
   (network errors, timeouts); any non-axios exception is retried as
   well. Up to 3 attempts in total, with exponential backoff plus
   jitter between them: 400ms * 2^(n-1) after attempt n, plus a random
   0-249ms. The previous catch (e) { return null; } swallowed network
   blips on the public fallback path, which is what every user without
   a client_id hits on each VOD list load — a single TCP RST produced
   an empty list and the user had to click refresh. A response that
   carries GraphQL errors[] still yields null without a retry (these
   are application-level query rejections). Every failed attempt and
   every recovery after a retry is logged via appendDebugLog so we can
   later see whether the retries actually pay off in production.

2. shutdownCleanup() consolidates window-all-closed and before-quit.
   The two handlers ran nearly identical cleanup blocks but had
   drifted: only window-all-closed killed children and was
   platform-aware. The helper first calls stopMetadataCacheCleanup,
   cleanupMetadataCaches and stopAutoUpdatePolling, then kills
   activeDownloads + activeClipProcesses + currentEditorProcess (each
   kill wrapped in try/catch), persists config + queue, and only then
   stops the debug-log flush timer (moved AFTER persistence so any
   save error reaches the log before the timer is gone). A
   shutdownCleanupDone flag makes the helper idempotent, so a
   follow-on event is a no-op. The platform check (no app.quit() on
   darwin) stays in the window-all-closed handler.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/main.ts | 139 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 104 insertions(+), 35 deletions(-)
diff --git a/src/main.ts b/src/main.ts
index 501763b..a0787de 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -1598,30 +1598,78 @@ function formatTwitchDurationFromSeconds(totalSeconds: number): string {
     return `${s}s`;
 }
 
-async function fetchPublicTwitchGql<T>(query: string, variables: Record<string, unknown>): Promise<T | null> {
-    try {
-        const response = await axios.post<{ data?: T; errors?: Array<{ message: string }> }>(
-            'https://gql.twitch.tv/gql',
-            { query, variables },
-            {
-                headers: {
-                    'Client-ID': TWITCH_WEB_CLIENT_ID,
-                    'Content-Type': 'application/json'
-                },
-                timeout: API_TIMEOUT
-            }
-        );
-
-        if (response.data.errors?.length) {
-            console.error('Public Twitch GQL errors:', response.data.errors.map((err) => err.message).join('; '));
-            return null;
-        }
-
-        return response.data.data || null;
-    } catch (e) {
-        console.error('Public Twitch GQL request failed:', e);
-        return null;
+// Transient HTTP errors that warrant a retry (5xx, 408 timeout, 429 rate limit).
+// 4xx (other than 408/429) are application errors and not retried.
+function isTransientAxiosError(err: unknown): boolean {
+    if (!axios.isAxiosError(err)) {
+        // Not an AxiosError (axios normally wraps DNS/ECONNRESET/socket-hangup
+        // failures itself; see the no-response branch below) — retry anyway.
+        return true;
     }
+    const status = err.response?.status;
+    if (status === undefined) {
+        // No response means the request never reached / never returned —
+        // treat as transient (network blip, timeout).
+        return true;
+    }
+    return status === 408 || status === 429 || (status >= 500 && status < 600);
+}
+
+const TWITCH_GQL_RETRY_ATTEMPTS = 3;
+const TWITCH_GQL_RETRY_BASE_DELAY_MS = 400;
+
+async function fetchPublicTwitchGql<T>(query: string, variables: Record<string, unknown>): Promise<T | null> {
+    let lastError: unknown = null;
+
+    for (let attempt = 1; attempt <= TWITCH_GQL_RETRY_ATTEMPTS; attempt++) {
+        try {
+            const response = await axios.post<{ data?: T; errors?: Array<{ message: string }> }>(
+                'https://gql.twitch.tv/gql',
+                { query, variables },
+                {
+                    headers: {
+                        'Client-ID': TWITCH_WEB_CLIENT_ID,
+                        'Content-Type': 'application/json'
+                    },
+                    timeout: API_TIMEOUT
+                }
+            );
+
+            // GraphQL errors (in `errors[]`) are application-level and not
+            // retried — the query itself is rejected.
+            if (response.data.errors?.length) {
+                const messages = response.data.errors.map((err) => err.message).join('; ');
+                appendDebugLog('public-gql-errors', { messages, attempt });
+                console.error('Public Twitch GQL errors:', messages);
+                return null;
+            }
+
+            if (attempt > 1) {
+                appendDebugLog('public-gql-recovered', { attempt });
+            }
+            return response.data.data || null;
+        } catch (e) {
+            lastError = e;
+            const transient = isTransientAxiosError(e);
+            const willRetry = transient && attempt < TWITCH_GQL_RETRY_ATTEMPTS;
+            appendDebugLog('public-gql-failed', {
+                attempt,
+                maxAttempts: TWITCH_GQL_RETRY_ATTEMPTS,
+                transient,
+                willRetry,
+                error: String(e)
+            });
+            if (!willRetry) {
+                break;
+            }
+            // Exponential backoff with jitter
+            const delay = TWITCH_GQL_RETRY_BASE_DELAY_MS * Math.pow(2, attempt - 1) + Math.floor(Math.random() * 250);
+            await sleep(delay);
+        }
+    }
+
+    console.error('Public Twitch GQL request failed:', lastError);
+    return null;
 }
 
 async function getPublicUserId(username: string): Promise<string | null> {
@@ -4050,38 +4098,59 @@ app.whenReady().then(() => {
     });
 });
 
-app.on('window-all-closed', () => {
+// Both window-all-closed and before-quit ran nearly identical cleanup blocks
+// before, with slight drift (only window-all-closed killed children, only
+// window-all-closed did anything platform-specific). Consolidating them into
+// a single idempotent helper means any future tweak (e.g. flushing a new
+// debug stream) lands once and applies on every quit path.
+let shutdownCleanupDone = false;
+
+function shutdownCleanup(reason: 'window-all-closed' | 'before-quit'): void {
+    if (shutdownCleanupDone) return;
+    shutdownCleanupDone = true;
+
+    appendDebugLog('shutdown-cleanup', { reason });
+
     stopMetadataCacheCleanup();
     cleanupMetadataCaches('shutdown');
-    stopDebugLogFlushTimer(true);
     stopAutoUpdatePolling();
 
     // Kill all active children: queue downloads, standalone clip downloads,
-    // and any in-flight cutter/merger/splitter ffmpeg.
+    // and any in-flight cutter/merger/splitter ffmpeg. before-quit used to
+    // skip this entirely; window-all-closed did it but only via direct
+    // kill() (no try/catch around the queue process kill).
     for (const [, tracking] of activeDownloads) {
         if (tracking.process) {
-            tracking.process.kill();
+            try { tracking.process.kill(); } catch { /* already exited */ }
         }
     }
+    activeDownloads.clear();
+
     for (const [, proc] of activeClipProcesses) {
-        try { proc.kill(); } catch { }
+        try { proc.kill(); } catch { /* already exited */ }
     }
+    activeClipProcesses.clear();
+
     if (currentEditorProcess) {
-        currentEditorProcess.kill();
+        try { currentEditorProcess.kill(); } catch { /* already exited */ }
+        currentEditorProcess = null;
     }
+
     saveConfig(config);
     flushQueueSave();
 
+    // Flush debug log AFTER persisting state so any errors saving config /
+    // queue land in the log before the timer is gone.
+    stopDebugLogFlushTimer(true);
+}
+
+app.on('window-all-closed', () => {
+    shutdownCleanup('window-all-closed');
     if (process.platform !== 'darwin') {
         app.quit();
     }
 });
 
 app.on('before-quit', () => {
-    stopMetadataCacheCleanup();
-    cleanupMetadataCaches('shutdown');
-    stopDebugLogFlushTimer(true);
-    stopAutoUpdatePolling();
-    saveConfig(config);
-    flushQueueSave();
+    shutdownCleanup('before-quit');
 });