Add job logging and increase timeout to 20 minutes
- Add JobLogger class to handler.py for structured timestamped logging
- Increase MAX_TIMEOUT from 600s to 1200s (20 minutes)
- Add logs column to generated_content table via migration
- Store and display job execution logs in gallery UI
- Add Logs button to gallery items with modal display

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -766,3 +766,65 @@ body {
   text-shadow: 0 1px 3px rgba(0,0,0,0.5);
   pointer-events: none;
 }
+
+/* Logs Modal */
+.logs-container {
+  background: var(--gray-100);
+  border-radius: var(--radius);
+  padding: 15px;
+}
+
+.logs-status {
+  margin-bottom: 12px;
+  font-size: 14px;
+}
+
+.logs-status .badge {
+  margin-left: 8px;
+}
+
+.badge-completed { background: var(--success); color: white; }
+.badge-processing { background: var(--warning); color: var(--gray-900); }
+.badge-pending { background: var(--gray-500); color: white; }
+.badge-failed { background: var(--danger); color: white; }
+
+.logs-error {
+  background: #f8d7da;
+  color: #721c24;
+  padding: 12px;
+  border-radius: var(--radius);
+  margin-bottom: 12px;
+  font-size: 13px;
+}
+
+.logs-content {
+  font-family: 'SF Mono', Monaco, 'Andale Mono', monospace;
+  font-size: 12px;
+  line-height: 1.6;
+  background: var(--gray-900);
+  color: #e0e0e0;
+  padding: 15px;
+  border-radius: var(--radius);
+  max-height: 400px;
+  overflow-y: auto;
+  white-space: pre-wrap;
+  word-break: break-word;
+}
+
+.logs-content::-webkit-scrollbar {
+  width: 8px;
+}
+
+.logs-content::-webkit-scrollbar-track {
+  background: var(--gray-800);
+  border-radius: 4px;
+}
+
+.logs-content::-webkit-scrollbar-thumb {
+  background: var(--gray-600);
+  border-radius: 4px;
+}
+
+.logs-content::-webkit-scrollbar-thumb:hover {
+  background: var(--gray-500);
+}
@@ -437,6 +437,7 @@ function renderGallery(container, items) {
         <div class="gallery-item-meta">
           <span>${formatDate(item.createdAt)}</span>
           <div class="gallery-item-actions">
+            <button class="btn btn-sm btn-secondary view-logs-btn" data-content-id="${item.id}">Logs</button>
             ${item.status === 'completed' ? `<a href="/api/content/${item.id}/download" class="btn btn-sm btn-secondary">Download</a>` : ''}
             <button class="btn btn-sm btn-danger delete-content-btn" data-content-id="${item.id}">Delete</button>
           </div>
@@ -470,6 +471,48 @@ function renderGallery(container, items) {
       }
     });
   });
+
+  container.querySelectorAll('.view-logs-btn').forEach(btn => {
+    btn.addEventListener('click', async function() {
+      const contentId = parseInt(this.dataset.contentId, 10);
+      await viewContentLogs(contentId);
+    });
+  });
 }
+
+async function viewContentLogs(contentId) {
+  try {
+    const data = await api(`/content/${contentId}/logs`);
+    const logs = data.logs || [];
+    const errorMessage = data.errorMessage;
+
+    let logsHtml = '';
+    if (logs.length === 0) {
+      logsHtml = '<p style="color:var(--gray-500);text-align:center;">No logs available</p>';
+    } else {
+      logsHtml = `<div class="logs-content">${logs.map(log => escapeHtml(log)).join('\n')}</div>`;
+    }
+
+    if (errorMessage) {
+      logsHtml = `<div class="logs-error"><strong>Error:</strong> ${escapeHtml(errorMessage)}</div>` + logsHtml;
+    }
+
+    showModal(`
+      <div class="modal-header">
+        <h3>Job Logs</h3>
+        <button class="modal-close">×</button>
+      </div>
+      <div class="logs-container">
+        <div class="logs-status">Status: <span class="badge badge-${data.status}">${data.status}</span></div>
+        ${logsHtml}
+      </div>
+      <div class="modal-footer">
+        <button class="btn btn-secondary modal-cancel-btn">Close</button>
+      </div>
+    `);
+  } catch (error) {
+    alert('Failed to load logs: ' + error.message);
+  }
+}

 // Admin
@@ -50,6 +50,7 @@ function runMigrations(database: Database.Database): void {
   // Migration files in order
   const migrations = [
     { version: 1, file: '001_initial.sql' },
+    { version: 2, file: '002_add_logs.sql' },
   ];

   for (const migration of migrations) {
@@ -131,5 +132,6 @@ export interface GeneratedContentRow {
   mime_type: string;
   status: 'pending' | 'processing' | 'completed' | 'failed';
   error_message: string | null;
+  logs: string | null;
   created_at: string;
 }
frontend/src/db/migrations/002_add_logs.sql (new file)
@@ -0,0 +1,2 @@
+-- Add logs column to store job execution logs
+ALTER TABLE generated_content ADD COLUMN logs TEXT;
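Note: the backend stores the logs array as a JSON string in this TEXT column (see the JSON.stringify/JSON.parse pair in the content-store hunk further down). A minimal sketch of that round-trip using Python's sqlite3, for illustration only; the database path and row id are made up:

    # Illustration of the JSON round-trip the content store performs
    # (database path and row id are hypothetical).
    import json
    import sqlite3

    db = sqlite3.connect("app.db")
    logs = ["[2025-01-01 12:00:00] [0.0s] Job started"]
    db.execute(
        "UPDATE generated_content SET logs = ? WHERE id = ?",
        (json.dumps(logs), 42),  # stored as JSON text, as the TS layer does
    )
    db.commit()
    row = db.execute("SELECT logs FROM generated_content WHERE id = ?", (42,)).fetchone()
    restored = json.loads(row[0]) if row and row[0] else None
    print(restored)  # ['[2025-01-01 12:00:00] [0.0s] Job started']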
@@ -103,6 +103,35 @@ router.get('/:id', (req, res) => {
   });
 });

+// Get content logs
+router.get('/:id/logs', (req, res) => {
+  const authReq = req as AuthenticatedRequest;
+  const contentId = parseInt(req.params.id, 10);
+
+  if (isNaN(contentId)) {
+    res.status(400).json({ error: 'Invalid content ID' });
+    return;
+  }
+
+  const content = getContentById(contentId);
+  if (!content) {
+    res.status(404).json({ error: 'Content not found' });
+    return;
+  }
+
+  if (!canAccessResource(authReq.user, content.userId)) {
+    res.status(403).json({ error: 'Access denied' });
+    return;
+  }
+
+  res.json({
+    contentId: content.id,
+    status: content.status,
+    logs: content.logs || [],
+    errorMessage: content.errorMessage,
+  });
+});
+
 // Download content file
 router.get('/:id/download', (req, res) => {
   const authReq = req as AuthenticatedRequest;
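Note: what a client sees from the new endpoint, sketched with Python's requests. The response shape follows the route above; host, port, content id, and the session cookie name are assumptions, not part of this commit:

    # Sketch: fetching job logs from the new endpoint
    # (host/port, id, and auth cookie are hypothetical).
    import requests

    resp = requests.get(
        "http://localhost:3000/api/content/42/logs",
        cookies={"session": "<session-cookie>"},
        timeout=10,
    )
    resp.raise_for_status()
    data = resp.json()
    print(data["status"])  # 'pending' | 'processing' | 'completed' | 'failed'
    for line in data.get("logs", []):
        print(line)        # timestamped JobLogger entries
    if data.get("errorMessage"):
        print("Error:", data["errorMessage"])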
@@ -123,9 +123,16 @@ router.get('/:jobId/status', asyncHandler(async (req, res) => {
       if (output.data) {
         // Save base64 data to file
         saveContentFile(row.id, output.data);
+        // Also save logs if present
+        if (status.output.logs) {
+          updateContentStatus(row.id, 'completed', { logs: status.output.logs });
+        }
       } else if (output.path) {
-        // File was saved to volume - update status
-        updateContentStatus(row.id, 'completed', { fileSize: output.size });
+        // File was saved to volume - update status with logs
+        updateContentStatus(row.id, 'completed', {
+          fileSize: output.size,
+          logs: status.output.logs,
+        });
       }
     }
   } else if (status.status === 'FAILED') {
@@ -138,6 +145,7 @@ router.get('/:jobId/status', asyncHandler(async (req, res) => {
     if (row) {
       updateContentStatus(row.id, 'failed', {
         errorMessage: status.error || status.output?.error || 'Unknown error',
+        logs: status.output?.logs,
       });
     }
   }
@@ -182,6 +190,7 @@ router.get('/content/:contentId/status', (req, res) => {
     status: content.status,
     runpodJobId: content.runpodJobId,
     errorMessage: content.errorMessage,
+    logs: content.logs,
   });
 });
@@ -23,6 +23,7 @@ function rowToContent(row: GeneratedContentRow): GeneratedContent {
     mimeType: row.mime_type,
     status: row.status,
     errorMessage: row.error_message,
+    logs: row.logs ? JSON.parse(row.logs) : null,
     createdAt: new Date(row.created_at),
   };
 }
@@ -68,6 +69,7 @@ export function updateContentStatus(
     runpodJobId?: string;
     fileSize?: number;
     errorMessage?: string;
+    logs?: string[];
   }
 ): GeneratedContent | null {
   const db = getDb();
@@ -87,6 +89,10 @@ export function updateContentStatus(
     setParts.push('error_message = ?');
     values.push(updates.errorMessage);
   }
+  if (updates?.logs !== undefined) {
+    setParts.push('logs = ?');
+    values.push(JSON.stringify(updates.logs));
+  }

   values.push(id);
@@ -58,13 +58,21 @@ async function processStuckJobs(): Promise<void> {
       const output = status.output.outputs[0];
       if (output.data) {
         saveContentFile(job.id, output.data);
+        // Also save logs if present
+        if (status.output.logs) {
+          updateContentStatus(job.id, 'completed', { logs: status.output.logs });
+        }
       } else {
-        updateContentStatus(job.id, 'completed', { fileSize: output.size });
+        updateContentStatus(job.id, 'completed', {
+          fileSize: output.size,
+          logs: status.output.logs,
+        });
       }
       logger.info({ contentId: job.id }, 'Background processor completed job');
     } else if (status.status === 'FAILED') {
       updateContentStatus(job.id, 'failed', {
         errorMessage: status.error || status.output?.error || 'Job failed',
+        logs: status.output?.logs,
       });
       logger.info({ contentId: job.id }, 'Background processor marked job as failed');
     }
@@ -68,6 +68,7 @@ export interface GeneratedContent {
   mimeType: string;
   status: 'pending' | 'processing' | 'completed' | 'failed';
   errorMessage: string | null;
+  logs: string[] | null;
   createdAt: Date;
 }
@@ -99,6 +100,7 @@ export interface RunPodJobStatus extends RunPodJob {
     size?: number;
   }>;
   error?: string;
+  logs?: string[];
 };
 error?: string;
 }
handler.py
@@ -28,7 +28,7 @@ import runpod
 COMFYUI_DIR = "/workspace/ComfyUI"
 COMFYUI_PORT = 8188
 COMFYUI_HOST = f"http://127.0.0.1:{COMFYUI_PORT}"
-MAX_TIMEOUT = 600  # 10 minutes max for video generation
+MAX_TIMEOUT = 1200  # 20 minutes max for video generation
 POLL_INTERVAL = 1.0
 STARTUP_TIMEOUT = 120
 DEFAULT_WORKFLOW_PATH = "/workspace/workflows/Wan22-I2V-Remix-API.json"
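Note: the handler still clamps client-supplied timeouts to this ceiling (see the timeout = min(...) line later in this diff), so requests cannot exceed 20 minutes regardless of input:

    MAX_TIMEOUT = 1200
    job_input = {"timeout": 3600}  # illustrative over-limit request
    # Clamp exactly as the handler does below
    timeout = min(job_input.get("timeout", MAX_TIMEOUT), MAX_TIMEOUT)
    print(timeout)  # 1200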
@@ -46,14 +46,41 @@ NODE_SAVE_VIDEO = "117"
 comfyui_process = None


-def start_comfyui():
+class JobLogger:
+    """Accumulates log messages with timestamps for returning to the client."""
+
+    def __init__(self):
+        self.logs = []
+        self.start_time = time.time()
+
+    def log(self, message: str):
+        """Log a message with timestamp."""
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
+        elapsed = time.time() - self.start_time
+        entry = f"[{timestamp}] [{elapsed:.1f}s] {message}"
+        self.logs.append(entry)
+        print(entry)  # Also print for RunPod console
+
+    def get_logs(self) -> list:
+        """Return all accumulated logs."""
+        return self.logs
+
+
+def start_comfyui(logger: JobLogger = None):
     """Start ComfyUI server if not already running."""
     global comfyui_process

+    def log(msg):
+        if logger:
+            logger.log(msg)
+        else:
+            print(msg)
+
     if comfyui_process is not None and comfyui_process.poll() is None:
+        log("ComfyUI server already running")
         return True

-    print("Starting ComfyUI server...")
+    log("Starting ComfyUI server...")

     comfyui_process = subprocess.Popen(
         [
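Note: for reference, what JobLogger entries look like; a quick illustration with invented timestamps:

    # Illustration only: JobLogger output format (timestamps invented).
    logger = JobLogger()
    logger.log("Job started")
    # ... roughly two seconds later ...
    logger.log("ComfyUI server ready")
    print(logger.get_logs())
    # ['[2025-01-01 12:00:00] [0.0s] Job started',
    #  '[2025-01-01 12:00:02] [2.1s] ComfyUI server ready']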
@@ -70,17 +97,25 @@ def start_comfyui():

     # Wait for server to be ready
     start_time = time.time()
+    last_status_time = start_time
     while time.time() - start_time < STARTUP_TIMEOUT:
         try:
             resp = requests.get(f"{COMFYUI_HOST}/system_stats", timeout=2)
             if resp.status_code == 200:
-                print("ComfyUI server ready")
+                log("ComfyUI server ready")
                 return True
         except requests.exceptions.RequestException:
             pass

+        # Log progress every 15 seconds
+        if time.time() - last_status_time >= 15:
+            elapsed = int(time.time() - start_time)
+            log(f"Waiting for ComfyUI startup... ({elapsed}s)")
+            last_status_time = time.time()
+
         time.sleep(1)

-    print("ComfyUI server failed to start")
+    log(f"ComfyUI server failed to start after {STARTUP_TIMEOUT}s")
     return False
@@ -305,9 +340,16 @@ def get_history(prompt_id: str) -> dict:
     return resp.json()


-def poll_for_completion(prompt_id: str, timeout: int = MAX_TIMEOUT) -> dict:
+def poll_for_completion(prompt_id: str, timeout: int = MAX_TIMEOUT, logger: JobLogger = None) -> dict:
     """Poll until workflow completes or timeout."""
     start_time = time.time()
+    last_log_time = start_time
+
+    def log(msg):
+        if logger:
+            logger.log(msg)
+        else:
+            print(msg)

     while time.time() - start_time < timeout:
         history = get_history(prompt_id)
@@ -323,6 +365,13 @@ def poll_for_completion(prompt_id: str, timeout: int = MAX_TIMEOUT) -> dict:
         if status.get("status_str") == "error":
             raise Exception(f"Workflow execution failed: {status}")

+        # Log progress every 30 seconds
+        if time.time() - last_log_time >= 30:
+            elapsed = int(time.time() - start_time)
+            remaining = timeout - elapsed
+            log(f"Generating... ({elapsed}s elapsed, {remaining}s remaining)")
+            last_log_time = time.time()
+
         time.sleep(POLL_INTERVAL)

     raise TimeoutError(f"Workflow execution timed out after {timeout}s")
@@ -419,37 +468,55 @@ def handler(job: dict) -> dict:
         "resolution": 720 (optional, default 720),
         "steps": 8 (optional, default 8),
         "split_step": 4 (optional, default 4),
-        "timeout": 600 (optional, max 600),
+        "timeout": 1200 (optional, max 1200),
         "workflow": {} (optional, override default workflow)
     }
     """
+    logger = JobLogger()
     job_input = job.get("input", {})

+    logger.log("Job started")
+
     # Validate required inputs
     if "image" not in job_input or not job_input["image"]:
-        return {"error": "Missing required 'image' (base64) in input"}
+        logger.log("ERROR: Missing required 'image' (base64) in input")
+        return {"error": "Missing required 'image' (base64) in input", "logs": logger.get_logs()}

     if "prompt" not in job_input or not job_input["prompt"]:
-        return {"error": "Missing required 'prompt' in input"}
+        logger.log("ERROR: Missing required 'prompt' in input")
+        return {"error": "Missing required 'prompt' in input", "logs": logger.get_logs()}

+    # Log input parameters
+    image_size_kb = len(job_input["image"]) * 3 / 4 / 1024  # Approximate decoded size
+    logger.log(f"Input image size: ~{image_size_kb:.1f} KB (base64)")
+    logger.log(f"Prompt: {job_input['prompt'][:100]}{'...' if len(job_input['prompt']) > 100 else ''}")
+    if job_input.get("negative_prompt"):
+        logger.log(f"Negative prompt: {job_input['negative_prompt'][:50]}...")
+    logger.log(f"Resolution: {job_input.get('resolution', 720)}")
+    logger.log(f"Steps: {job_input.get('steps', 8)}")
+    logger.log(f"Split step: {job_input.get('split_step', 4)}")
+
     # Ensure ComfyUI is running
-    if not start_comfyui():
-        return {"error": "Failed to start ComfyUI server"}
+    if not start_comfyui(logger):
+        logger.log("ERROR: Failed to start ComfyUI server")
+        return {"error": "Failed to start ComfyUI server", "logs": logger.get_logs()}

     try:
         # Load workflow (custom or default)
         if "workflow" in job_input and job_input["workflow"]:
+            logger.log("Loading custom workflow")
             workflow = job_input["workflow"]
             # Convert frontend format if needed
             workflow = convert_frontend_to_api(workflow)
         else:
             # Load and convert default workflow
+            logger.log("Loading default workflow")
             frontend_workflow = load_default_workflow()
             workflow = convert_frontend_to_api(frontend_workflow)

+        logger.log(f"Workflow loaded with {len(workflow)} nodes")
+
         # Upload image
         image_filename = upload_image(job_input["image"])
-        print(f"Uploaded image: {image_filename}")
+        logger.log(f"Uploaded image: {image_filename}")

         # Build params for injection
         params = {
@@ -471,48 +538,34 @@ def handler(job: dict) -> dict:

         # Inject parameters into workflow
         workflow = inject_wan22_params(workflow, params)
-
-        # Debug: print output chain nodes to verify connections
-        print("=== Workflow Output Chain ===")
-        # Check the output chain: 117 <- 116 <- 115 <- 158 <- 140
-        for node_id in ["117", "116", "115"]:
-            if node_id in workflow:
-                node = workflow[node_id]
-                print(f"Node {node_id} ({node['class_type']}): {node['inputs']}")
-            else:
-                print(f"Node {node_id}: MISSING FROM WORKFLOW!")
-        print(f"Total nodes in workflow: {len(workflow)}")
+        logger.log("Parameters injected into workflow")

         # Queue workflow
         client_id = uuid.uuid4().hex
         prompt_id = queue_workflow(workflow, client_id)
-        print(f"Queued workflow: {prompt_id}")
+        logger.log(f"Workflow queued: {prompt_id}")

         # Poll for completion
         timeout = min(job_input.get("timeout", MAX_TIMEOUT), MAX_TIMEOUT)
-        history = poll_for_completion(prompt_id, timeout)
-        print("Workflow completed")
-
-        # Debug: print history structure
-        print(f"History keys: {history.keys()}")
-        if "outputs" in history:
-            print(f"Output nodes: {list(history['outputs'].keys())}")
-            for node_id, node_out in history["outputs"].items():
-                print(f"  Node {node_id}: {list(node_out.keys())}")
-        if "status" in history:
-            print(f"Status: {history['status']}")
+        logger.log(f"Starting generation (timeout: {timeout}s)")
+        history = poll_for_completion(prompt_id, timeout, logger)
+        logger.log("Workflow completed successfully")

         # Get output files
         outputs = get_output_files(history)

         if not outputs:
-            return {"error": "No outputs generated"}
+            logger.log("ERROR: No outputs generated")
+            return {"error": "No outputs generated", "logs": logger.get_logs()}
+
+        logger.log(f"Found {len(outputs)} output file(s)")

         # Fetch and encode outputs
         results = []
         for output_info in outputs:
             data = fetch_output(output_info)
-            print(f"Fetched output: {output_info['filename']} ({len(data)} bytes)")
+            size_mb = len(data) / 1024 / 1024
+            logger.log(f"Fetched output: {output_info['filename']} ({size_mb:.2f} MB)")

             # Check size for video files
             if output_info["type"] == "video" and len(data) > 10 * 1024 * 1024:
@@ -520,6 +573,7 @@ def handler(job: dict) -> dict:
             output_path = Path("/runpod-volume/outputs") / output_info["filename"]
             output_path.parent.mkdir(parents=True, exist_ok=True)
             output_path.write_bytes(data)
+            logger.log(f"Large video saved to volume: {output_path}")
             results.append({
                 "type": output_info["type"],
                 "filename": output_info["filename"],
@@ -535,18 +589,24 @@ def handler(job: dict) -> dict:
                 "size": len(data)
             })

+        logger.log("Job completed successfully")
+
         return {
             "status": "success",
             "prompt_id": prompt_id,
-            "outputs": results
+            "outputs": results,
+            "logs": logger.get_logs()
         }

     except TimeoutError as e:
-        return {"error": str(e), "status": "timeout"}
+        logger.log(f"ERROR: Timeout - {str(e)}")
+        return {"error": str(e), "status": "timeout", "logs": logger.get_logs()}
     except Exception as e:
         import traceback
-        traceback.print_exc()
-        return {"error": str(e), "status": "error"}
+        tb = traceback.format_exc()
+        logger.log(f"ERROR: {str(e)}")
+        logger.log(f"Traceback:\n{tb}")
+        return {"error": str(e), "status": "error", "logs": logger.get_logs()}


 # RunPod serverless entry point
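Note: putting it together, a successful handler response now carries the log trail alongside the outputs. A sketch of the shape, with keys taken from the return statement above; filenames, sizes, and timestamps are invented:

    # Illustrative success payload (values invented; the per-output
    # "data"/"path" variants depend on file size, per the handler).
    response = {
        "status": "success",
        "prompt_id": "a1b2c3d4",  # hypothetical
        "outputs": [
            {"type": "video", "filename": "out_00001.mp4",
             "data": "<base64>", "size": 4_200_000},
        ],
        "logs": [
            "[2025-01-01 12:00:00] [0.0s] Job started",
            "[2025-01-01 12:03:10] [190.2s] Workflow completed successfully",
        ],
    }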