Files
cloudflare-crawl-mcp/dist/index.js
T
2026-03-11 19:24:29 +00:00

340 lines
13 KiB
JavaScript

import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
const API_BASE = "https://api.cloudflare.com/client/v4";
const MAX_RETRIES = 3;
const RATE_LIMIT_DELAY_MS = 10000;
let lastRequestTime = 0;
let requestCount = 0;
let windowStart = Date.now();
function getEnv(key) {
const value = process.env[key];
if (!value) {
throw new Error(`Missing required environment variable: ${key}`);
}
return value;
}
async function enforceRateLimit() {
const now = Date.now();
const windowDuration = 60000;
if (now - windowStart >= windowDuration) {
requestCount = 0;
windowStart = now;
}
const requestsPerMinute = parseInt(process.env.CF_RATE_LIMIT || "6", 10);
if (requestCount >= requestsPerMinute) {
const waitTime = windowDuration - (now - windowStart);
console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
await new Promise((resolve) => setTimeout(resolve, waitTime));
requestCount = 0;
windowStart = Date.now();
}
const timeSinceLastRequest = now - lastRequestTime;
if (timeSinceLastRequest < RATE_LIMIT_DELAY_MS && requestCount > 0) {
const waitTime = RATE_LIMIT_DELAY_MS - timeSinceLastRequest;
await new Promise((resolve) => setTimeout(resolve, waitTime));
}
lastRequestTime = Date.now();
requestCount++;
}
async function fetchWithRetry(fn, retries = MAX_RETRIES) {
let lastError = null;
for (let attempt = 0; attempt < retries; attempt++) {
try {
return await fn();
}
catch (error) {
lastError = error;
const errorStr = error.message || "";
const isRateLimit = errorStr.includes("429") ||
errorStr.includes("Rate limit");
if (!isRateLimit || attempt === retries - 1) {
throw error;
}
const retryAfterMatch = errorStr.match(/Retry-After[:\s]*(\d+)/i);
const delay = retryAfterMatch
? parseInt(retryAfterMatch[1], 10) * 1000
: Math.min(1000 * Math.pow(2, attempt), 30000);
console.error(`Rate limited. Retrying in ${delay}ms...`);
await new Promise((resolve) => setTimeout(resolve, delay));
}
}
throw lastError;
}
async function initiateCrawl(accountId, apiToken, options) {
await enforceRateLimit();
return fetchWithRetry(async () => {
const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiToken}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
url: options.url,
limit: options.limit ?? 10,
depth: options.depth ?? 1,
formats: options.formats ?? ["markdown"],
render: options.render ?? true,
maxAge: options.maxAge,
source: options.source ?? "all",
options: options.options ?? {},
}),
});
if (!response.ok) {
const error = await response.text();
const retryAfter = response.headers.get("Retry-After");
const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
throw new Error(errorMsg);
}
const data = await response.json();
if (!data.success) {
throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
}
return data.result.id;
});
}
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
for (let i = 0; i < maxAttempts; i++) {
const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`, {
headers: {
Authorization: `Bearer ${apiToken}`,
},
});
if (!response.ok) {
const error = await response.text();
throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
}
const data = await response.json();
const status = data.result.status;
if (status !== "running") {
return data.result;
}
await new Promise((resolve) => setTimeout(resolve, delayMs));
}
throw new Error("Crawl job did not complete within timeout");
}
function buildCrawlOptions(args, formats) {
return {
url: args.url,
limit: args.limit,
depth: args.depth,
formats,
render: args.render,
options: {
includeExternalLinks: args.includeExternalLinks,
includeSubdomains: args.includeSubdomains,
includePatterns: args.includePatterns,
excludePatterns: args.excludePatterns,
},
};
}
function formatMarkdownResult(result) {
const records = result.records || [];
const completedRecords = records.filter((r) => r.status === "completed");
const content = completedRecords
.map((record) => {
const title = record.metadata?.title || record.url;
return `## ${title}\n\nURL: ${record.url}\n\n${record.markdown || ""}\n\n---\n`;
})
.join("\n");
return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
function formatHtmlResult(result) {
const records = result.records || [];
const completedRecords = records.filter((r) => r.status === "completed");
const content = completedRecords
.map((record) => {
const title = record.metadata?.title || record.url;
return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${record.url}">${record.url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
})
.join("\n");
return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
function formatJsonResult(result) {
const records = result.records || [];
const completedRecords = records.filter((r) => r.status === "completed");
const jsonOutput = {
summary: {
total: result.total,
completed: completedRecords.length,
status: result.status,
},
pages: completedRecords.map((record) => ({
url: record.url,
title: record.metadata?.title,
status: record.metadata?.status,
markdown: record.markdown,
html: record.html,
json: record.json,
})),
};
return JSON.stringify(jsonOutput, null, 2);
}
function handleErrorResult(result, jobId) {
const errorMessages = {
errored: `Crawl job errored. Job ID: ${jobId}`,
cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
};
const message = errorMessages[result.status] || `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
return {
content: [{ type: "text", text: message }],
isError: true,
};
}
const server = new Server({
name: "cloudflare-crawl-mcp",
version: "1.0.0",
}, {
capabilities: {
tools: {},
},
});
const baseToolSchema = {
type: "object",
properties: {
url: {
type: "string",
description: "The starting URL to crawl",
},
limit: {
type: "number",
description: "Maximum number of pages to crawl (default: 10, max: 100000)",
},
depth: {
type: "number",
description: "Maximum link depth to crawl from the starting URL (default: 1)",
},
includeSubdomains: {
type: "boolean",
description: "If true, follows links to subdomains of the starting URL (default: false)",
},
includeExternalLinks: {
type: "boolean",
description: "If true, follows links to external domains (default: false)",
},
includePatterns: {
type: "array",
items: { type: "string" },
description: "Only visits URLs that match one of these wildcard patterns",
},
excludePatterns: {
type: "array",
items: { type: "string" },
description: "Does not visit URLs that match any of these wildcard patterns",
},
render: {
type: "boolean",
description: "If false, does a fast HTML fetch without executing JavaScript (default: true)",
},
},
required: ["url"],
};
const RATE_LIMIT_INFO = `
---
**Cloudflare Browser Rendering Limits:**
| Plan | Concurrent Browsers | Browser Time | REST API Rate |
|------|---------------------|--------------|---------------|
| Free | 3 | 10 min/day | 6 req/min |
| Paid | 10 | 10 hours/month | 600 req/min |
**Environment Variables:**
- CF_RATE_LIMIT: Override REST API requests per minute (default: 6 for Free, 600 for Paid)
**Tips:**
- Use \`render: false\` for static content to avoid browser time usage
- Use \`maxAge\` to cache results and reduce API calls
- Set \`limit\` and \`depth\` appropriately to stay within limits
---`;
server.setRequestHandler(ListToolsRequestSchema, async () => {
return {
tools: [
{
name: "crawl_url_markdown",
description: `Crawl a website using Cloudflare Browser Rendering and return content in Markdown format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
inputSchema: {
...baseToolSchema,
properties: {
...baseToolSchema.properties,
},
},
},
{
name: "crawl_url_html",
description: `Crawl a website using Cloudflare Browser Rendering and return content in HTML format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
inputSchema: baseToolSchema,
},
{
name: "crawl_url_json",
description: `Crawl a website using Cloudflare Browser Rendering and return content in JSON format. This uses Workers AI for data extraction. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
inputSchema: baseToolSchema,
},
],
};
});
server.setRequestHandler(CallToolRequestSchema, async (request) => {
const { name, arguments: args } = request.params;
const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
if (!toolMatch) {
return {
content: [{ type: "text", text: `Unknown tool: ${name}` }],
isError: true,
};
}
const format = toolMatch[1];
const formatMap = {
markdown: ["markdown"],
html: ["html"],
json: ["json"],
};
const formats = formatMap[format];
try {
const apiToken = getEnv("CF_API_TOKEN");
const accountId = getEnv("CF_ACCOUNT_ID");
const crawlArgs = {
url: args.url,
limit: args.limit,
depth: args.depth,
includeSubdomains: args.includeSubdomains,
includeExternalLinks: args.includeExternalLinks,
includePatterns: args.includePatterns,
excludePatterns: args.excludePatterns,
render: args.render,
};
const options = buildCrawlOptions(crawlArgs, formats);
const jobId = await initiateCrawl(accountId, apiToken, options);
const result = await waitForCrawl(accountId, apiToken, jobId);
const terminalStatuses = ["errored", "cancelled_due_to_timeout", "cancelled_due_to_limits", "cancelled_by_user"];
if (terminalStatuses.includes(result.status)) {
return handleErrorResult(result, jobId);
}
const formatterMap = {
markdown: formatMarkdownResult,
html: formatHtmlResult,
json: formatJsonResult,
};
const formattedContent = formatterMap[format](result);
return {
content: [{ type: "text", text: formattedContent }],
};
}
catch (error) {
const message = error instanceof Error ? error.message : String(error);
return {
content: [{ type: "text", text: `Error: ${message}` }],
isError: true,
};
}
});
async function main() {
const transport = new StdioServerTransport();
await server.connect(transport);
}
main().catch((error) => {
console.error("Server error:", error);
process.exit(1);
});