Better proxy handling.

This commit is contained in:
Mario Zechner 2025-10-28 00:21:54 +01:00
parent ec50ede6c2
commit b6b64dff86
6 changed files with 298 additions and 160 deletions

View file

@ -5,6 +5,7 @@ import { createRef, ref } from "lit/directives/ref.js";
import { FileText } from "lucide";
import { EXTRACT_DOCUMENT_DESCRIPTION } from "../prompts/prompts.js";
import { loadAttachment } from "../utils/attachment-utils.js";
import { isCorsError } from "../utils/proxy-utils.js";
import { registerToolRenderer, renderCollapsibleHeader, renderHeader } from "./renderer-registry.js";
import type { ToolRenderer, ToolRenderResult } from "./types.js";
@ -34,13 +35,13 @@ export interface ExtractDocumentResult {
export function createExtractDocumentTool(): AgentTool<typeof extractDocumentSchema, ExtractDocumentResult> & {
corsProxyUrl?: string;
} {
return {
const tool = {
label: "Extract Document",
name: "extract_document",
corsProxyUrl: undefined, // Can be set by consumer (e.g., from user settings)
corsProxyUrl: undefined as string | undefined, // Can be set by consumer (e.g., from user settings)
description: EXTRACT_DOCUMENT_DESCRIPTION,
parameters: extractDocumentSchema,
execute: async function (_toolCallId: string, args: ExtractDocumentParams, signal?: AbortSignal) {
execute: async (_toolCallId: string, args: ExtractDocumentParams, signal?: AbortSignal) => {
if (signal?.aborted) {
throw new Error("Extract document aborted");
}
@ -57,17 +58,11 @@ export function createExtractDocumentTool(): AgentTool<typeof extractDocumentSch
throw new Error(`Invalid URL: ${url}`);
}
// Determine fetch URL (with or without CORS proxy)
let fetchUrl = url;
if (this.corsProxyUrl) {
fetchUrl = this.corsProxyUrl + encodeURIComponent(url);
}
// Size limit: 50MB
const MAX_SIZE = 50 * 1024 * 1024;
try {
// Attempt to fetch the document
// Helper function to fetch and process document
const fetchAndProcess = async (fetchUrl: string) => {
const response = await fetch(fetchUrl, { signal });
if (!response.ok) {
@ -98,52 +93,31 @@ export function createExtractDocumentTool(): AgentTool<typeof extractDocumentSch
);
}
// Extract filename from URL
const urlParts = url.split("/");
let fileName = urlParts[urlParts.length - 1]?.split("?")[0] || "document";
if (url.startsWith("https://arxiv.org/")) {
fileName = fileName + ".pdf";
}
return arrayBuffer;
};
// Use loadAttachment to process the document
const attachment = await loadAttachment(arrayBuffer, fileName);
// Try without proxy first, fallback to proxy on CORS error
let arrayBuffer: ArrayBuffer;
if (!attachment.extractedText) {
const mimeType = response.headers.get("content-type") || "unknown";
throw new Error(
`Document format not supported. Supported formats:\n` +
`- PDF (.pdf)\n` +
`- Word (.docx)\n` +
`- Excel (.xlsx, .xls)\n` +
`- PowerPoint (.pptx)\n\n` +
`Detected: ${mimeType}`,
);
}
// Determine format from attachment
let format = "unknown";
if (attachment.mimeType.includes("pdf")) {
format = "pdf";
} else if (attachment.mimeType.includes("wordprocessingml")) {
format = "docx";
} else if (attachment.mimeType.includes("spreadsheetml") || attachment.mimeType.includes("ms-excel")) {
format = "xlsx";
} else if (attachment.mimeType.includes("presentationml")) {
format = "pptx";
}
return {
output: attachment.extractedText,
details: {
extractedText: attachment.extractedText,
format,
fileName: attachment.fileName,
size: attachment.size,
},
};
} catch (error: any) {
// Handle CORS errors specifically
if (error.name === "TypeError" && error.message.includes("Failed to fetch")) {
try {
// Attempt direct fetch first
arrayBuffer = await fetchAndProcess(url);
} catch (directError: any) {
// If CORS error and proxy is available, retry with proxy
if (isCorsError(directError) && tool.corsProxyUrl) {
try {
const proxiedUrl = tool.corsProxyUrl + encodeURIComponent(url);
arrayBuffer = await fetchAndProcess(proxiedUrl);
} catch (proxyError: any) {
// Proxy fetch also failed - throw helpful message
throw new Error(
`TELL USER: Unable to fetch the document due to CORS restrictions.\n\n` +
`Tried with proxy but it also failed: ${proxyError.message}\n\n` +
`INSTRUCT USER: Please download the file manually and attach it to your message using the attachment button (paperclip icon) in the message input area. I can then extract the text from the attached file.`,
);
}
} else if (isCorsError(directError) && !tool.corsProxyUrl) {
// CORS error but no proxy configured
throw new Error(
`TELL USER: Unable to fetch the document due to CORS restrictions (the server blocks requests from browser extensions).\n\n` +
`To fix this, you need to configure a CORS proxy in Sitegeist settings:\n` +
@ -151,15 +125,58 @@ export function createExtractDocumentTool(): AgentTool<typeof extractDocumentSch
`2. Find "CORS Proxy URL" setting\n` +
`3. Enter a proxy URL like: https://corsproxy.io/?\n` +
`4. Save and try again\n\n` +
`Would you like me to explain what a CORS proxy is and how to set one up?`,
`Alternatively, download the file manually and attach it to your message using the attachment button (paperclip icon).`,
);
} else {
// Not a CORS error - re-throw
throw directError;
}
// Re-throw other errors
throw error;
}
// Extract filename from URL
const urlParts = url.split("/");
let fileName = urlParts[urlParts.length - 1]?.split("?")[0] || "document";
if (url.startsWith("https://arxiv.org/")) {
fileName = fileName + ".pdf";
}
// Use loadAttachment to process the document
const attachment = await loadAttachment(arrayBuffer, fileName);
if (!attachment.extractedText) {
throw new Error(
`Document format not supported. Supported formats:\n` +
`- PDF (.pdf)\n` +
`- Word (.docx)\n` +
`- Excel (.xlsx, .xls)\n` +
`- PowerPoint (.pptx)`,
);
}
// Determine format from attachment
let format = "unknown";
if (attachment.mimeType.includes("pdf")) {
format = "pdf";
} else if (attachment.mimeType.includes("wordprocessingml")) {
format = "docx";
} else if (attachment.mimeType.includes("spreadsheetml") || attachment.mimeType.includes("ms-excel")) {
format = "xlsx";
} else if (attachment.mimeType.includes("presentationml")) {
format = "pptx";
}
return {
output: attachment.extractedText,
details: {
extractedText: attachment.extractedText,
format,
fileName: attachment.fileName,
size: attachment.size,
},
};
},
};
return tool;
}
// Export a default instance
@ -214,7 +231,7 @@ export const extractDocumentRenderer: ToolRenderer<ExtractDocumentParams, Extrac
}
${
result.isError && output
? html`<console-block .content=${output} .variant="error"></console-block>`
? html`<console-block .content=${output} .variant=${"error"}></console-block>`
: ""
}
</div>