mirror of
https://github.com/lukaszraczylo/cloudflare-crawl-mcp.git
synced 2026-06-08 23:03:38 +00:00
340 lines
13 KiB
JavaScript
340 lines
13 KiB
JavaScript
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
|
|
const API_BASE = "https://api.cloudflare.com/client/v4";
|
|
const MAX_RETRIES = 3;
|
|
const RATE_LIMIT_DELAY_MS = 10000;
|
|
let lastRequestTime = 0;
|
|
let requestCount = 0;
|
|
let windowStart = Date.now();
|
|
function getEnv(key) {
|
|
const value = process.env[key];
|
|
if (!value) {
|
|
throw new Error(`Missing required environment variable: ${key}`);
|
|
}
|
|
return value;
|
|
}
|
|
async function enforceRateLimit() {
|
|
const now = Date.now();
|
|
const windowDuration = 60000;
|
|
if (now - windowStart >= windowDuration) {
|
|
requestCount = 0;
|
|
windowStart = now;
|
|
}
|
|
const requestsPerMinute = parseInt(process.env.CF_RATE_LIMIT || "6", 10);
|
|
if (requestCount >= requestsPerMinute) {
|
|
const waitTime = windowDuration - (now - windowStart);
|
|
console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
|
|
await new Promise((resolve) => setTimeout(resolve, waitTime));
|
|
requestCount = 0;
|
|
windowStart = Date.now();
|
|
}
|
|
const timeSinceLastRequest = now - lastRequestTime;
|
|
if (timeSinceLastRequest < RATE_LIMIT_DELAY_MS && requestCount > 0) {
|
|
const waitTime = RATE_LIMIT_DELAY_MS - timeSinceLastRequest;
|
|
await new Promise((resolve) => setTimeout(resolve, waitTime));
|
|
}
|
|
lastRequestTime = Date.now();
|
|
requestCount++;
|
|
}
|
|
async function fetchWithRetry(fn, retries = MAX_RETRIES) {
|
|
let lastError = null;
|
|
for (let attempt = 0; attempt < retries; attempt++) {
|
|
try {
|
|
return await fn();
|
|
}
|
|
catch (error) {
|
|
lastError = error;
|
|
const errorStr = error.message || "";
|
|
const isRateLimit = errorStr.includes("429") ||
|
|
errorStr.includes("Rate limit");
|
|
if (!isRateLimit || attempt === retries - 1) {
|
|
throw error;
|
|
}
|
|
const retryAfterMatch = errorStr.match(/Retry-After[:\s]*(\d+)/i);
|
|
const delay = retryAfterMatch
|
|
? parseInt(retryAfterMatch[1], 10) * 1000
|
|
: Math.min(1000 * Math.pow(2, attempt), 30000);
|
|
console.error(`Rate limited. Retrying in ${delay}ms...`);
|
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
}
|
|
}
|
|
throw lastError;
|
|
}
|
|
async function initiateCrawl(accountId, apiToken, options) {
|
|
await enforceRateLimit();
|
|
return fetchWithRetry(async () => {
|
|
const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl`, {
|
|
method: "POST",
|
|
headers: {
|
|
Authorization: `Bearer ${apiToken}`,
|
|
"Content-Type": "application/json",
|
|
},
|
|
body: JSON.stringify({
|
|
url: options.url,
|
|
limit: options.limit ?? 10,
|
|
depth: options.depth ?? 1,
|
|
formats: options.formats ?? ["markdown"],
|
|
render: options.render ?? true,
|
|
maxAge: options.maxAge,
|
|
source: options.source ?? "all",
|
|
options: options.options ?? {},
|
|
}),
|
|
});
|
|
if (!response.ok) {
|
|
const error = await response.text();
|
|
const retryAfter = response.headers.get("Retry-After");
|
|
const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
|
|
throw new Error(errorMsg);
|
|
}
|
|
const data = await response.json();
|
|
if (!data.success) {
|
|
throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
|
|
}
|
|
return data.result.id;
|
|
});
|
|
}
|
|
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
|
|
for (let i = 0; i < maxAttempts; i++) {
|
|
const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`, {
|
|
headers: {
|
|
Authorization: `Bearer ${apiToken}`,
|
|
},
|
|
});
|
|
if (!response.ok) {
|
|
const error = await response.text();
|
|
throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
|
|
}
|
|
const data = await response.json();
|
|
const status = data.result.status;
|
|
if (status !== "running") {
|
|
return data.result;
|
|
}
|
|
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
}
|
|
throw new Error("Crawl job did not complete within timeout");
|
|
}
|
|
function buildCrawlOptions(args, formats) {
|
|
return {
|
|
url: args.url,
|
|
limit: args.limit,
|
|
depth: args.depth,
|
|
formats,
|
|
render: args.render,
|
|
options: {
|
|
includeExternalLinks: args.includeExternalLinks,
|
|
includeSubdomains: args.includeSubdomains,
|
|
includePatterns: args.includePatterns,
|
|
excludePatterns: args.excludePatterns,
|
|
},
|
|
};
|
|
}
|
|
function formatMarkdownResult(result) {
|
|
const records = result.records || [];
|
|
const completedRecords = records.filter((r) => r.status === "completed");
|
|
const content = completedRecords
|
|
.map((record) => {
|
|
const title = record.metadata?.title || record.url;
|
|
return `## ${title}\n\nURL: ${record.url}\n\n${record.markdown || ""}\n\n---\n`;
|
|
})
|
|
.join("\n");
|
|
return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
|
|
}
|
|
function formatHtmlResult(result) {
|
|
const records = result.records || [];
|
|
const completedRecords = records.filter((r) => r.status === "completed");
|
|
const content = completedRecords
|
|
.map((record) => {
|
|
const title = record.metadata?.title || record.url;
|
|
return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${record.url}">${record.url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
|
|
})
|
|
.join("\n");
|
|
return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
|
|
}
|
|
function formatJsonResult(result) {
|
|
const records = result.records || [];
|
|
const completedRecords = records.filter((r) => r.status === "completed");
|
|
const jsonOutput = {
|
|
summary: {
|
|
total: result.total,
|
|
completed: completedRecords.length,
|
|
status: result.status,
|
|
},
|
|
pages: completedRecords.map((record) => ({
|
|
url: record.url,
|
|
title: record.metadata?.title,
|
|
status: record.metadata?.status,
|
|
markdown: record.markdown,
|
|
html: record.html,
|
|
json: record.json,
|
|
})),
|
|
};
|
|
return JSON.stringify(jsonOutput, null, 2);
|
|
}
|
|
function handleErrorResult(result, jobId) {
|
|
const errorMessages = {
|
|
errored: `Crawl job errored. Job ID: ${jobId}`,
|
|
cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
|
|
cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
|
|
cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
|
|
};
|
|
const message = errorMessages[result.status] || `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
|
|
return {
|
|
content: [{ type: "text", text: message }],
|
|
isError: true,
|
|
};
|
|
}
|
|
const server = new Server({
|
|
name: "cloudflare-crawl-mcp",
|
|
version: "1.0.0",
|
|
}, {
|
|
capabilities: {
|
|
tools: {},
|
|
},
|
|
});
|
|
const baseToolSchema = {
|
|
type: "object",
|
|
properties: {
|
|
url: {
|
|
type: "string",
|
|
description: "The starting URL to crawl",
|
|
},
|
|
limit: {
|
|
type: "number",
|
|
description: "Maximum number of pages to crawl (default: 10, max: 100000)",
|
|
},
|
|
depth: {
|
|
type: "number",
|
|
description: "Maximum link depth to crawl from the starting URL (default: 1)",
|
|
},
|
|
includeSubdomains: {
|
|
type: "boolean",
|
|
description: "If true, follows links to subdomains of the starting URL (default: false)",
|
|
},
|
|
includeExternalLinks: {
|
|
type: "boolean",
|
|
description: "If true, follows links to external domains (default: false)",
|
|
},
|
|
includePatterns: {
|
|
type: "array",
|
|
items: { type: "string" },
|
|
description: "Only visits URLs that match one of these wildcard patterns",
|
|
},
|
|
excludePatterns: {
|
|
type: "array",
|
|
items: { type: "string" },
|
|
description: "Does not visit URLs that match any of these wildcard patterns",
|
|
},
|
|
render: {
|
|
type: "boolean",
|
|
description: "If false, does a fast HTML fetch without executing JavaScript (default: true)",
|
|
},
|
|
},
|
|
required: ["url"],
|
|
};
|
|
const RATE_LIMIT_INFO = `
|
|
---
|
|
**Cloudflare Browser Rendering Limits:**
|
|
|
|
| Plan | Concurrent Browsers | Browser Time | REST API Rate |
|
|
|------|---------------------|--------------|---------------|
|
|
| Free | 3 | 10 min/day | 6 req/min |
|
|
| Paid | 10 | 10 hours/month | 600 req/min |
|
|
|
|
**Environment Variables:**
|
|
- CF_RATE_LIMIT: Override REST API requests per minute (default: 6 for Free, 600 for Paid)
|
|
|
|
**Tips:**
|
|
- Use \`render: false\` for static content to avoid browser time usage
|
|
- Use \`maxAge\` to cache results and reduce API calls
|
|
- Set \`limit\` and \`depth\` appropriately to stay within limits
|
|
---`;
|
|
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
return {
|
|
tools: [
|
|
{
|
|
name: "crawl_url_markdown",
|
|
description: `Crawl a website using Cloudflare Browser Rendering and return content in Markdown format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
|
|
inputSchema: {
|
|
...baseToolSchema,
|
|
properties: {
|
|
...baseToolSchema.properties,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
name: "crawl_url_html",
|
|
description: `Crawl a website using Cloudflare Browser Rendering and return content in HTML format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
|
|
inputSchema: baseToolSchema,
|
|
},
|
|
{
|
|
name: "crawl_url_json",
|
|
description: `Crawl a website using Cloudflare Browser Rendering and return content in JSON format. This uses Workers AI for data extraction. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
|
|
inputSchema: baseToolSchema,
|
|
},
|
|
],
|
|
};
|
|
});
|
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
const { name, arguments: args } = request.params;
|
|
const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
|
|
if (!toolMatch) {
|
|
return {
|
|
content: [{ type: "text", text: `Unknown tool: ${name}` }],
|
|
isError: true,
|
|
};
|
|
}
|
|
const format = toolMatch[1];
|
|
const formatMap = {
|
|
markdown: ["markdown"],
|
|
html: ["html"],
|
|
json: ["json"],
|
|
};
|
|
const formats = formatMap[format];
|
|
try {
|
|
const apiToken = getEnv("CF_API_TOKEN");
|
|
const accountId = getEnv("CF_ACCOUNT_ID");
|
|
const crawlArgs = {
|
|
url: args.url,
|
|
limit: args.limit,
|
|
depth: args.depth,
|
|
includeSubdomains: args.includeSubdomains,
|
|
includeExternalLinks: args.includeExternalLinks,
|
|
includePatterns: args.includePatterns,
|
|
excludePatterns: args.excludePatterns,
|
|
render: args.render,
|
|
};
|
|
const options = buildCrawlOptions(crawlArgs, formats);
|
|
const jobId = await initiateCrawl(accountId, apiToken, options);
|
|
const result = await waitForCrawl(accountId, apiToken, jobId);
|
|
const terminalStatuses = ["errored", "cancelled_due_to_timeout", "cancelled_due_to_limits", "cancelled_by_user"];
|
|
if (terminalStatuses.includes(result.status)) {
|
|
return handleErrorResult(result, jobId);
|
|
}
|
|
const formatterMap = {
|
|
markdown: formatMarkdownResult,
|
|
html: formatHtmlResult,
|
|
json: formatJsonResult,
|
|
};
|
|
const formattedContent = formatterMap[format](result);
|
|
return {
|
|
content: [{ type: "text", text: formattedContent }],
|
|
};
|
|
}
|
|
catch (error) {
|
|
const message = error instanceof Error ? error.message : String(error);
|
|
return {
|
|
content: [{ type: "text", text: `Error: ${message}` }],
|
|
isError: true,
|
|
};
|
|
}
|
|
});
|
|
async function main() {
|
|
const transport = new StdioServerTransport();
|
|
await server.connect(transport);
|
|
}
|
|
main().catch((error) => {
|
|
console.error("Server error:", error);
|
|
process.exit(1);
|
|
});
|