Initial commit.

This commit is contained in:
2026-03-11 19:24:29 +00:00
commit 6b47d30961
21 changed files with 9899 additions and 0 deletions
+9
View File
@@ -0,0 +1,9 @@
# Cloudflare API Token (get from https://dash.cloudflare.com/profile/api-tokens)
# Required permissions: Account > Browser Rendering > Edit
CF_API_TOKEN=your_cloudflare_api_token
# Cloudflare Account ID (get from https://dash.cloudflare.com/_/account)
CF_ACCOUNT_ID=your_cloudflare_account_id
# Rate limit: REST API requests per minute (default: 6, the Free plan limit; set to 600 on a Paid plan)
# CF_RATE_LIMIT=6
+43
View File
@@ -0,0 +1,43 @@
name: Test and Release
on:
push:
branches: [main]
pull_request:
branches: [main]
release:
types: [published]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: '20'
cache: 'npm'
- run: npm ci
- run: npm run build
- run: npm test
release:
needs: test
runs-on: ubuntu-latest
if: github.event_name == 'release'
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: '20'
registry-url: 'https://registry.npmjs.org'
- run: npm ci
- run: npm run build
- name: Run GoReleaser
uses: goreleaser/goreleaser-action@v6
with:
version: latest
args: release --clean
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
+2
View File
@@ -0,0 +1,2 @@
node_modules
.envrc
+51
View File
@@ -0,0 +1,51 @@
# Goreleaser config for @lukaszraczylo/cloudflare-crawl-mcp
# Publishes to npm on release
project_name: cloudflare-crawl-mcp
before:
hooks:
- npm ci
- npm run build
builds:
- id: cloudflare-crawl-mcp
dir: .
env:
- CGO_ENABLED=0
goos:
- linux
- darwin
- windows
goarch:
- amd64
- arm64
archives:
- id: default
format: tarball
snapshot:
name_template: "{{ .Tag }}-next"
changelog:
sort: asc
filters:
exclude:
- "^docs:"
- "^test:"
- "^chore:"
release:
github:
owner: lukaszraczylo
name: cloudflare-crawl-mcp
draft: false
npm:
name: "@lukaszraczylo/cloudflare-crawl-mcp"
channel: "latest"
dir: "."
skip_upload: false
scripts:
post_pack: npm run build
+30
View File
@@ -0,0 +1,30 @@
# Dependencies
node_modules/
# Build output
dist/
release/
# Test coverage
coverage/
# Development
.vscode/
.idea/
*.log
# Git
.git/
.github/
# Config
.env
.env.*
!.env.example
# Misc
*.ts
!*.d.ts
tsconfig.json
vitest.config.ts
.goreleaser.yaml
+2
View File
@@ -0,0 +1,2 @@
github: lukaszraczylo
custom: https://github.com/sponsors/lukaszraczylo
+21
View File
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 Lukasz Raczylo
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+297
View File
@@ -0,0 +1,297 @@
# @lukaszraczylo/cloudflare-crawl-mcp
<p align="center">
<a href="https://www.npmjs.com/package/@lukaszraczylo/cloudflare-crawl-mcp">
<img src="https://img.shields.io/npm/v/@lukaszraczylo/cloudflare-crawl-mcp" alt="NPM Version">
</a>
<a href="https://github.com/lukaszraczylo/cloudflare-crawl-mcp/blob/main/LICENSE">
<img src="https://img.shields.io/badge/license-MIT-blue.svg" alt="License">
</a>
</p>
MCP server for crawling websites using Cloudflare Browser Rendering API. Supports multiple output formats including Markdown, HTML, and JSON.
## Features
- **Multiple Output Formats**: Choose between Markdown, HTML, or JSON output
- **Configurable Crawling**: Control depth, page limits, and link following
- **Pattern Filtering**: Include/exclude URLs using wildcard patterns
- **JavaScript Rendering**: Execute JavaScript for dynamic content (or disable for static content)
- **Environment-Based Secrets**: Securely manage credentials via environment variables
## Prerequisites
- Node.js 18+
- Cloudflare account with Browser Rendering API access
- Cloudflare API Token with `Browser Rendering` permissions
- Cloudflare Account ID
## Quick Start
```bash
# Install dependencies and build (run inside the cloned repository)
npm install
npm run build
# Run with environment variables
CF_API_TOKEN=your_token CF_ACCOUNT_ID=your_account_id npm start
```
## Installation
### 1. Clone the Repository
```bash
git clone https://github.com/lukaszraczylo/cloudflare-crawl-mcp.git
cd cloudflare-crawl-mcp
```
### 2. Install Dependencies
```bash
npm install
```
### 3. Build the Server
```bash
npm run build
```
### 4. Configure Environment Variables
Copy the example environment file and add your credentials:
```bash
cp .env.example .env
```
Edit `.env` with your Cloudflare credentials:
```
CF_API_TOKEN=your_cloudflare_api_token
CF_ACCOUNT_ID=your_cloudflare_account_id
```
#### Getting Cloudflare Credentials
1. **Account ID**: Find it at https://dash.cloudflare.com/_/account
2. **API Token**: Create one at https://dash.cloudflare.com/profile/api-tokens with these permissions:
- `Account` > `Browser Rendering` > `Edit`
## MCP Configuration
### Claude Desktop (macOS)
Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
```json
{
"mcpServers": {
"cloudflare-crawl": {
"command": "npm",
"args": ["start"],
"env": {
"CF_API_TOKEN": "your_api_token",
"CF_ACCOUNT_ID": "your_account_id"
},
"path": "/path/to/cloudflare-crawl-mcp"
}
}
}
```
### Claude Code (CLI)
```json
{
"mcpServers": {
"cloudflare-crawl": {
"command": "npm",
"args": ["start"],
"env": {
"CF_API_TOKEN": "your_api_token",
"CF_ACCOUNT_ID": "your_account_id"
}
}
}
}
```
### Cursor
Add to `~/.cursor/settings.json` (MCP configuration):
```json
{
"mcpServers": {
"cloudflare-crawl": {
"command": "npm",
"args": ["start"],
"env": {
"CF_API_TOKEN": "your_api_token",
"CF_ACCOUNT_ID": "your_account_id"
},
"path": "/path/to/cloudflare-crawl-mcp"
}
}
}
```
## Available Tools
### crawl_url_markdown
Crawl a website and return content in **Markdown** format.
```json
{
"name": "crawl_url_markdown",
"arguments": {
"url": "https://example.com/docs",
"limit": 50,
"depth": 2,
"includePatterns": ["https://example.com/docs/**"],
"excludePatterns": ["https://example.com/docs/archive/**"],
"render": true
}
}
```
### crawl_url_html
Crawl a website and return content in **HTML** format.
```json
{
"name": "crawl_url_html",
"arguments": {
"url": "https://example.com",
"limit": 10
}
}
```
### crawl_url_json
Crawl a website and return content in **JSON** format (uses Workers AI for data extraction).
```json
{
"name": "crawl_url_json",
"arguments": {
"url": "https://example.com/products",
"limit": 20
}
}
```
## Parameters
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `url` | string | required | Starting URL to crawl |
| `limit` | number | 10 | Maximum pages to crawl (max: 100,000) |
| `depth` | number | 1 | Maximum link depth from starting URL |
| `includeSubdomains` | boolean | false | Follow links to subdomains |
| `includeExternalLinks` | boolean | false | Follow links to external domains |
| `includePatterns` | string[] | [] | Wildcard patterns to include |
| `excludePatterns` | string[] | [] | Wildcard patterns to exclude |
| `render` | boolean | true | Execute JavaScript (false = faster static fetch) |
### Pattern Syntax
- `*` - Matches any characters except `/`
- `**` - Matches any characters including `/`
Examples:
- `https://example.com/docs/**` - All URLs under /docs
- `https://example.com/*.html` - All HTML files directly in root
## Development
### Commands
```bash
npm run build # Build TypeScript
npm start # Run server
npm test # Run tests
npm run test:watch # Run tests in watch mode
```
### Testing
The project includes comprehensive tests covering:
- Environment variable handling
- Crawl options building
- Result formatting (Markdown, HTML, JSON)
- Error handling
- API integration
Run tests:
```bash
npm test
```
## Architecture
```
src/
├── index.ts # Main MCP server implementation
├── API Layer
│ ├── initiateCrawl() # POST to /crawl endpoint
│ ├── waitForCrawl() # Poll for job completion and return the job result
│ └── fetchWithRetry() # Retry requests that hit the rate limit (429)
├── Formatters
│ ├── formatMarkdownResult()
│ ├── formatHtmlResult()
│ └── formatJsonResult()
└── MCP Handlers
├── ListToolsRequestSchema # Tool registration
└── CallToolRequestSchema # Tool execution
```
## Cloudflare Limits
- **Max crawl duration**: 7 days
- **Results available**: 14 days after completion
- **Max pages per job**: 100,000
- **Free plan**: 10 minutes of browser time per day
See [Cloudflare Browser Rendering Limits](https://developers.cloudflare.com/browser-rendering/limits/) for details.
## Troubleshooting
### Crawl returns no results
- Check `robots.txt` blocking (use `render: false` to bypass)
- Verify `includePatterns` match actual URLs
- Try increasing `depth` or disabling pattern filters
### Job cancelled due to limits
- Upgrade to Workers Paid plan
- Use `render: false` for static content
- Reduce `limit` parameter
### Authentication errors
- Verify API Token has Browser Rendering permissions
- Confirm Account ID is correct
## License
MIT License - see [LICENSE](LICENSE) file.
## Contributing
Contributions are welcome! Please read our contributing guidelines before submitting PRs at https://github.com/lukaszraczylo/cloudflare-crawl-mcp.
## Support
- Open an issue at https://github.com/lukaszraczylo/cloudflare-crawl-mcp/issues
- Check Cloudflare's [Browser Rendering Docs](https://developers.cloudflare.com/browser-rendering/) for API details
+1
View File
@@ -0,0 +1 @@
export {};
+339
View File
@@ -0,0 +1,339 @@
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
// Base URL of the Cloudflare v4 REST API; account-scoped paths are appended to it.
const API_BASE = "https://api.cloudflare.com/client/v4";
// Default value of fetchWithRetry's `retries` parameter.
const MAX_RETRIES = 3;
// Spacing in milliseconds used by enforceRateLimit to pace consecutive requests.
const RATE_LIMIT_DELAY_MS = 10000;
// Mutable rate-limiter state, read and updated by enforceRateLimit():
// timestamp (ms since epoch) recorded for the most recent request,
let lastRequestTime = 0;
// number of requests counted in the current 60-second window,
let requestCount = 0;
// and the timestamp (ms since epoch) at which that window started.
let windowStart = Date.now();
/**
 * Looks up a required environment variable.
 *
 * @param {string} key Name of the variable to read from `process.env`.
 * @returns {string} The variable's value.
 * @throws {Error} When the variable is unset or set to an empty string.
 */
function getEnv(key) {
  const found = process.env[key];
  if (found) {
    return found;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
/**
 * Delays the caller as needed to keep REST API calls within the configured
 * requests-per-minute budget, then records the request.
 *
 * The budget comes from the CF_RATE_LIMIT environment variable (default 6).
 * Two mechanisms apply:
 *  - a fixed 60-second window that admits at most `requestsPerMinute` requests;
 *  - a minimum spacing between consecutive requests inside a window.
 *
 * Reads and updates the module-level `requestCount`, `windowStart` and
 * `lastRequestTime`.
 *
 * @returns {Promise<void>} Resolves once the caller may issue its request.
 */
async function enforceRateLimit() {
  const DEFAULT_REQUESTS_PER_MINUTE = 6;
  const windowDuration = 60000;
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  let now = Date.now();
  if (now - windowStart >= windowDuration) {
    requestCount = 0;
    windowStart = now;
  }

  // parseInt yields NaN for non-numeric input, and `count >= NaN` is always
  // false, which would silently switch the window limit off. Zero or negative
  // values are equally meaningless, so all of them fall back to the default.
  const configured = parseInt(process.env.CF_RATE_LIMIT || String(DEFAULT_REQUESTS_PER_MINUTE), 10);
  const requestsPerMinute = Number.isFinite(configured) && configured > 0
    ? configured
    : DEFAULT_REQUESTS_PER_MINUTE;

  if (requestCount >= requestsPerMinute) {
    const waitTime = windowDuration - (now - windowStart);
    console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
    await sleep(waitTime);
    requestCount = 0;
    windowStart = Date.now();
    now = windowStart;
  }

  // Derive the spacing from the configured budget so that a higher
  // CF_RATE_LIMIT (e.g. 600 on a paid plan) is actually honoured; a fixed
  // 10 s gap would cap throughput at 6 requests/minute whatever the setting.
  // RATE_LIMIT_DELAY_MS stays as the upper bound, which keeps the default
  // (6/min -> 10 s) and lower budgets behaving as before.
  const minSpacing = Math.min(RATE_LIMIT_DELAY_MS, Math.ceil(windowDuration / requestsPerMinute));
  const timeSinceLastRequest = now - lastRequestTime;
  if (timeSinceLastRequest < minSpacing && requestCount > 0) {
    await sleep(minSpacing - timeSinceLastRequest);
  }

  lastRequestTime = Date.now();
  requestCount++;
}
/**
 * Runs `fn`, retrying with a delay when it fails with a rate-limit error.
 *
 * A failure counts as a rate limit when its message contains "429" or
 * "Rate limit". Any other failure is rethrown immediately. The delay is taken
 * from a "Retry-After: <seconds>" fragment in the message when present,
 * otherwise it grows exponentially (1 s, 2 s, 4 s, ...) up to 30 s.
 *
 * @param {() => Promise<any>} fn Operation to run.
 * @param {number} [retries=MAX_RETRIES] Total number of attempts. Values below
 *   1 (or NaN) are treated as a single attempt.
 * @returns {Promise<any>} Whatever `fn` resolves to.
 * @throws The last error raised by `fn` once attempts are exhausted, or the
 *   first non-rate-limit error.
 */
async function fetchWithRetry(fn, retries = MAX_RETRIES) {
  // Always make at least one attempt: previously `retries <= 0` skipped `fn`
  // entirely and threw `null`.
  const maxAttempts = retries >= 1 ? Math.floor(retries) : 1;
  for (let attempt = 0;; attempt++) {
    try {
      return await fn();
    }
    catch (error) {
      // Thrown values are not guaranteed to be Error instances (or even
      // objects), so read the message defensively.
      const errorStr = typeof error?.message === "string" ? error.message : String(error ?? "");
      const isRateLimit = errorStr.includes("429") || errorStr.includes("Rate limit");
      if (!isRateLimit || attempt >= maxAttempts - 1) {
        throw error;
      }
      const retryAfterMatch = errorStr.match(/Retry-After[:\s]*(\d+)/i);
      const delay = retryAfterMatch
        ? parseInt(retryAfterMatch[1], 10) * 1000
        : Math.min(1000 * Math.pow(2, attempt), 30000);
      console.error(`Rate limited. Retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
/**
 * Starts a crawl job through the Cloudflare Browser Rendering REST API.
 *
 * Waits for the local rate limiter once, then POSTs the crawl request via
 * fetchWithRetry so rate-limited responses are retried.
 *
 * @param {string} accountId Cloudflare account ID used in the request path.
 * @param {string} apiToken API token sent as a Bearer credential.
 * @param {object} options Crawl options; unset `limit`, `depth`, `formats`,
 *   `render`, `source` and `options` fall back to 10, 1, ["markdown"], true,
 *   "all" and {} respectively.
 * @returns {Promise<string>} Identifier of the created crawl job.
 * @throws {Error} On a non-2xx response, on `success: false`, or when the
 *   response carries no job ID.
 */
async function initiateCrawl(accountId, apiToken, options) {
  await enforceRateLimit();
  return fetchWithRetry(async () => {
    const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiToken}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: options.url,
        limit: options.limit ?? 10,
        depth: options.depth ?? 1,
        formats: options.formats ?? ["markdown"],
        render: options.render ?? true,
        maxAge: options.maxAge,
        source: options.source ?? "all",
        options: options.options ?? {},
      }),
    });
    if (!response.ok) {
      const error = await response.text();
      // The Retry-After header is folded into the message so that
      // fetchWithRetry can pick the delay out of it.
      const retryAfter = response.headers.get("Retry-After");
      const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
      throw new Error(errorMsg);
    }
    const data = await response.json();
    if (!data.success) {
      throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
    }
    // NOTE(review): the Cloudflare crawl documentation appears to return the
    // job ID directly as a string in `result`; both that shape and
    // `{ id }` are accepted here - confirm against the live API.
    const jobId = typeof data.result === "string" ? data.result : data.result?.id;
    if (jobId === undefined || jobId === null || jobId === "") {
      // Fail here rather than letting callers poll `/crawl/undefined`.
      throw new Error(`Crawl initiation failed: response did not include a job ID: ${JSON.stringify(data.result)}`);
    }
    return jobId;
  });
}
/**
 * Polls a crawl job until it leaves the "running" state and returns its result.
 *
 * Polling requests ask for a single record (`?limit=1`) to keep them light.
 * Once the job reports "completed", the job is fetched again without that
 * limit so the returned result carries the crawled records rather than just
 * the first one. Any other non-running status is returned as reported by the
 * polling request.
 *
 * @param {string} accountId Cloudflare account ID used in the request path.
 * @param {string} apiToken API token sent as a Bearer credential.
 * @param {string} jobId Identifier returned by initiateCrawl.
 * @param {number} [maxAttempts=120] Maximum number of status polls.
 * @param {number} [delayMs=5000] Pause between polls, in milliseconds.
 * @returns {Promise<object>} The job's `result` object.
 * @throws {Error} On a non-2xx response, a response without `result`, or when
 *   the job is still running after `maxAttempts` polls.
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
  const jobUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}`;
  const fetchJob = async (query, failurePrefix) => {
    const response = await fetch(`${jobUrl}${query}`, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`${failurePrefix}: ${response.status} ${error}`);
    }
    const data = await response.json();
    if (!data || !data.result) {
      // Guard against `data.result.status` blowing up with a TypeError.
      throw new Error(`${failurePrefix}: unexpected response ${JSON.stringify(data)}`);
    }
    return data.result;
  };
  for (let i = 0; i < maxAttempts; i++) {
    const polled = await fetchJob("?limit=1", "Failed to check crawl status");
    if (polled.status !== "running") {
      if (polled.status !== "completed") {
        return polled;
      }
      // The polling response was requested with limit=1 and so cannot serve as
      // the final result set.
      // NOTE(review): if the API paginates large result sets (e.g. via a
      // cursor), only the first page is returned here - confirm and follow
      // the cursor if needed.
      return fetchJob("", "Failed to fetch crawl results");
    }
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  throw new Error("Crawl job did not complete within timeout");
}
/**
 * Maps flat tool arguments onto the nested option object used for a crawl.
 *
 * Link-filter arguments are grouped under `options`; everything else stays at
 * the top level. Arguments that were not supplied are passed through as
 * `undefined`.
 *
 * @param {object} args Tool arguments (`url`, `limit`, `depth`, `render`,
 *   `includeExternalLinks`, `includeSubdomains`, `includePatterns`,
 *   `excludePatterns`).
 * @param {string[]} formats Output formats to request.
 * @returns {object} Crawl options with keys `url`, `limit`, `depth`,
 *   `formats`, `render` and `options`.
 */
function buildCrawlOptions(args, formats) {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;
  const options = { includeExternalLinks, includeSubdomains, includePatterns, excludePatterns };
  return { url, limit, depth, formats, render, options };
}
/**
 * Renders a crawl result as one Markdown document.
 *
 * Only records whose status is "completed" are included. Each one becomes a
 * section headed by the page title (or the URL when no title is present),
 * followed by its URL, its Markdown body and a horizontal rule.
 *
 * @param {object} result Crawl result with `records` and `total`.
 * @returns {string} Summary line followed by the page sections.
 */
function formatMarkdownResult(result) {
  const sections = [];
  for (const page of result.records || []) {
    if (page.status !== "completed") {
      continue;
    }
    const heading = page.metadata?.title || page.url;
    sections.push(`## ${heading}\n\nURL: ${page.url}\n\n${page.markdown || ""}\n\n---\n`);
  }
  const body = sections.join("\n");
  return `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.\n\n${body}`;
}
/**
 * Renders a crawl result as a sequence of HTML `<article>` elements.
 *
 * Only records whose status is "completed" are included. The page title (or
 * the URL when no title is present) and the URL come from crawled pages, so
 * they are HTML-escaped before being placed in element text and the `href`
 * attribute. The page's own HTML body is inserted unchanged, since it is the
 * content being returned.
 *
 * @param {object} result Crawl result with `records` and `total`.
 * @returns {string} Summary line followed by one article per completed page.
 */
function formatHtmlResult(result) {
  const HTML_ENTITIES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  // Escapes the characters that could otherwise close the surrounding tag or
  // attribute and corrupt the generated markup.
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);
  const records = result.records || [];
  const completedRecords = records.filter((r) => r.status === "completed");
  const content = completedRecords
    .map((record) => {
      const title = escapeHtml(record.metadata?.title || record.url);
      const url = escapeHtml(record.url);
      return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${url}">${url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
    })
    .join("\n");
  return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
/**
 * Serialises a crawl result as pretty-printed JSON.
 *
 * The output has a `summary` (total, number of completed pages, job status)
 * and a `pages` array with one entry per record whose status is "completed".
 *
 * @param {object} result Crawl result with `records`, `total` and `status`.
 * @returns {string} JSON text indented with two spaces.
 */
function formatJsonResult(result) {
  const pages = [];
  for (const page of result.records || []) {
    if (page.status !== "completed") {
      continue;
    }
    pages.push({
      url: page.url,
      title: page.metadata?.title,
      status: page.metadata?.status,
      markdown: page.markdown,
      html: page.html,
      json: page.json,
    });
  }
  const summary = {
    total: result.total,
    completed: pages.length,
    status: result.status,
  };
  return JSON.stringify({ summary, pages }, null, 2);
}
/**
 * Builds the MCP error response for a crawl job that ended unsuccessfully.
 *
 * Known failure statuses get a dedicated message; any other status is
 * reported with a generic message that includes the status text.
 *
 * @param {object} result Crawl result; only `status` is read.
 * @param {string} jobId Crawl job identifier, echoed in the message.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 */
function handleErrorResult(result, jobId) {
  const knownFailures = {
    errored: `Crawl job errored. Job ID: ${jobId}`,
    cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
    cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
    cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
  };
  let text = knownFailures[result.status];
  if (!text) {
    text = `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
  }
  return {
    content: [{ type: "text", text }],
    isError: true,
  };
}
// MCP server instance: identifies itself as "cloudflare-crawl-mcp" and
// declares the tools capability; request handlers are attached to it below.
const server = new Server(
  { name: "cloudflare-crawl-mcp", version: "1.0.0" },
  { capabilities: { tools: {} } },
);
// JSON Schema for the arguments accepted by the crawl tools. Only `url` is
// required.
const baseToolSchema = (() => {
  // Scalar property: a type plus its description.
  const scalar = (type, description) => ({ type, description });
  // Property holding a list of wildcard pattern strings.
  const patternList = (description) => ({
    type: "array",
    items: { type: "string" },
    description,
  });
  return {
    type: "object",
    properties: {
      url: scalar("string", "The starting URL to crawl"),
      limit: scalar("number", "Maximum number of pages to crawl (default: 10, max: 100000)"),
      depth: scalar("number", "Maximum link depth to crawl from the starting URL (default: 1)"),
      includeSubdomains: scalar("boolean", "If true, follows links to subdomains of the starting URL (default: false)"),
      includeExternalLinks: scalar("boolean", "If true, follows links to external domains (default: false)"),
      includePatterns: patternList("Only visits URLs that match one of these wildcard patterns"),
      excludePatterns: patternList("Does not visit URLs that match any of these wildcard patterns"),
      render: scalar("boolean", "If false, does a fast HTML fetch without executing JavaScript (default: true)"),
    },
    required: ["url"],
  };
})();
// Markdown note appended to every tool description so clients see the
// Cloudflare plan limits and the CF_RATE_LIMIT override. The default quoted
// here matches enforceRateLimit, which uses 6 requests/minute unless
// CF_RATE_LIMIT is set; it does not detect the account's plan.
const RATE_LIMIT_INFO = `
---
**Cloudflare Browser Rendering Limits:**
| Plan | Concurrent Browsers | Browser Time | REST API Rate |
|------|---------------------|--------------|---------------|
| Free | 3 | 10 min/day | 6 req/min |
| Paid | 10 | 10 hours/month | 600 req/min |
**Environment Variables:**
- CF_RATE_LIMIT: Override REST API requests per minute (default: 6, the Free plan limit; set to 600 on a Paid plan)
**Tips:**
- Use \`render: false\` for static content to avoid browser time usage
- Use \`maxAge\` to cache results and reduce API calls
- Set \`limit\` and \`depth\` appropriately to stay within limits
---`;
// Handles tool listing: advertises one crawl tool per output format
// (Markdown, HTML, JSON). All three accept the arguments described by
// baseToolSchema, and each description ends with the rate-limit note.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  // Shared description text; `extra` is an optional sentence inserted after
  // the format statement.
  const describeTool = (formatLabel, extra = "") => `Crawl a website using Cloudflare Browser Rendering and return content in ${formatLabel} format.${extra} Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`;
  const markdownTool = {
    name: "crawl_url_markdown",
    description: describeTool("Markdown"),
    inputSchema: {
      ...baseToolSchema,
      properties: {
        ...baseToolSchema.properties,
      },
    },
  };
  const htmlTool = {
    name: "crawl_url_html",
    description: describeTool("HTML"),
    inputSchema: baseToolSchema,
  };
  const jsonTool = {
    name: "crawl_url_json",
    description: describeTool("JSON", " This uses Workers AI for data extraction."),
    inputSchema: baseToolSchema,
  };
  return { tools: [markdownTool, htmlTool, jsonTool] };
});
// Handles tool calls for crawl_url_markdown / crawl_url_html / crawl_url_json:
// validates the request, starts the crawl, waits for it to finish and returns
// the result formatted for the requested output format. Every failure is
// reported as an `isError` tool result rather than thrown.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: rawArgs } = request.params;
  const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
  if (!toolMatch) {
    return {
      content: [{ type: "text", text: `Unknown tool: ${name}` }],
      isError: true,
    };
  }
  // The captured group is the output format and doubles as the API format name.
  const format = toolMatch[1];
  // `arguments` may be absent from the request; treat that as "no arguments"
  // instead of failing later with a TypeError on `args.url`.
  const args = rawArgs ?? {};
  if (typeof args.url !== "string" || args.url.trim() === "") {
    // `url` is the only required argument; reject early with a clear message
    // rather than spending an API request on a call that cannot succeed.
    return {
      content: [{ type: "text", text: "Error: Missing required argument: url (must be a non-empty string)" }],
      isError: true,
    };
  }
  try {
    const apiToken = getEnv("CF_API_TOKEN");
    const accountId = getEnv("CF_ACCOUNT_ID");
    const crawlArgs = {
      url: args.url,
      limit: args.limit,
      depth: args.depth,
      includeSubdomains: args.includeSubdomains,
      includeExternalLinks: args.includeExternalLinks,
      includePatterns: args.includePatterns,
      excludePatterns: args.excludePatterns,
      render: args.render,
    };
    const options = buildCrawlOptions(crawlArgs, [format]);
    const jobId = await initiateCrawl(accountId, apiToken, options);
    const result = await waitForCrawl(accountId, apiToken, jobId);
    const terminalStatuses = ["errored", "cancelled_due_to_timeout", "cancelled_due_to_limits", "cancelled_by_user"];
    if (terminalStatuses.includes(result.status)) {
      return handleErrorResult(result, jobId);
    }
    const formatterMap = {
      markdown: formatMarkdownResult,
      html: formatHtmlResult,
      json: formatJsonResult,
    };
    const formattedContent = formatterMap[format](result);
    return {
      content: [{ type: "text", text: formattedContent }],
    };
  }
  catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      content: [{ type: "text", text: `Error: ${message}` }],
      isError: true,
    };
  }
});
/**
 * Connects the MCP server to a stdio transport.
 *
 * @returns {Promise<void>} Resolves once `server.connect` has completed.
 */
async function main() {
  await server.connect(new StdioServerTransport());
}
// Entry point: start the server; if startup fails, log the failure to stderr
// and exit with a non-zero status.
main().catch(function onStartupFailure(failure) {
  console.error("Server error:", failure);
  process.exit(1);
});
+1
View File
@@ -0,0 +1 @@
export {};
+628
View File
@@ -0,0 +1,628 @@
import { describe, it, expect, beforeEach, vi } from 'vitest';
// Base URL of the Cloudflare v4 REST API used by the helper copies below.
const API_BASE = "https://api.cloudflare.com/client/v4";
/**
 * Looks up a required environment variable.
 *
 * @param {string} key Name of the variable to read from `process.env`.
 * @returns {string} The variable's value.
 * @throws {Error} When the variable is unset or set to an empty string.
 */
function getEnv(key) {
  const found = process.env[key];
  if (found) {
    return found;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
/**
 * Starts a crawl job through the Cloudflare Browser Rendering REST API.
 *
 * @param {string} accountId Cloudflare account ID used in the request path.
 * @param {string} apiToken API token sent as a Bearer credential.
 * @param {object} options Crawl options; unset `limit`, `depth`, `formats`,
 *   `render`, `source` and `options` fall back to 10, 1, ["markdown"], true,
 *   "all" and {} respectively.
 * @returns {Promise<string>} The `result.id` field of the API response.
 * @throws {Error} On a non-2xx response or when the response has
 *   `success: false`.
 */
async function initiateCrawl(accountId, apiToken, options) {
  const endpoint = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`;
  const payload = {
    url: options.url,
    limit: options.limit ?? 10,
    depth: options.depth ?? 1,
    formats: options.formats ?? ["markdown"],
    render: options.render ?? true,
    maxAge: options.maxAge,
    source: options.source ?? "all",
    options: options.options ?? {},
  };
  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiToken}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });
  if (!response.ok) {
    const responseText = await response.text();
    throw new Error(`Failed to initiate crawl: ${response.status} ${responseText}`);
  }
  const data = await response.json();
  if (data.success) {
    return data.result.id;
  }
  throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
}
/**
 * Polls a crawl job until its status is no longer "running".
 *
 * @param {string} accountId Cloudflare account ID used in the request path.
 * @param {string} apiToken API token sent as a Bearer credential.
 * @param {string} jobId Identifier of the crawl job.
 * @param {number} [maxAttempts=120] Maximum number of status polls.
 * @param {number} [delayMs=5000] Pause after each "running" poll, in ms.
 * @returns {Promise<object>} The `result` object of the first response whose
 *   status is not "running".
 * @throws {Error} On a non-2xx response, or when the job is still running
 *   after `maxAttempts` polls.
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
  const pause = () => new Promise((resolve) => setTimeout(resolve, delayMs));
  let attempt = 0;
  while (attempt < maxAttempts) {
    const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;
    const response = await fetch(statusUrl, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });
    if (!response.ok) {
      const responseText = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${responseText}`);
    }
    const { result } = await response.json();
    if (result.status !== "running") {
      return result;
    }
    await pause();
    attempt += 1;
  }
  throw new Error("Crawl job did not complete within timeout");
}
/**
 * Maps flat tool arguments onto the nested option object used for a crawl.
 *
 * Link-filter arguments are grouped under `options`; everything else stays at
 * the top level. Arguments that were not supplied are passed through as
 * `undefined`.
 *
 * @param {object} args Tool arguments.
 * @param {string[]} formats Output formats to request.
 * @returns {object} Crawl options with keys `url`, `limit`, `depth`,
 *   `formats`, `render` and `options`.
 */
function buildCrawlOptions(args, formats) {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;
  const options = { includeExternalLinks, includeSubdomains, includePatterns, excludePatterns };
  return { url, limit, depth, formats, render, options };
}
/**
 * Renders a crawl result as one Markdown document.
 *
 * Only records whose status is "completed" are included. Each one becomes a
 * section headed by the page title (or the URL when no title is present).
 *
 * @param {object} result Crawl result with `records` and `total`.
 * @returns {string} Summary line followed by the page sections.
 */
function formatMarkdownResult(result) {
  const sections = [];
  for (const page of result.records || []) {
    if (page.status !== "completed") {
      continue;
    }
    const heading = page.metadata?.title || page.url;
    sections.push(`## ${heading}\n\nURL: ${page.url}\n\n${page.markdown || ""}\n\n---\n`);
  }
  const body = sections.join("\n");
  return `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.\n\n${body}`;
}
/**
 * Renders a crawl result as a sequence of HTML `<article>` elements.
 *
 * Only records whose status is "completed" are included. Title, URL and page
 * HTML are inserted as-is.
 *
 * @param {object} result Crawl result with `records` and `total`.
 * @returns {string} Summary line followed by one article per completed page.
 */
function formatHtmlResult(result) {
  const articles = [];
  for (const page of result.records || []) {
    if (page.status !== "completed") {
      continue;
    }
    const heading = page.metadata?.title || page.url;
    articles.push(`<article>\n <h2>${heading}</h2>\n <p>Source: <a href="${page.url}">${page.url}</a></p>\n <div class="content">${page.html || ""}</div>\n</article>\n`);
  }
  const body = articles.join("\n");
  return `Crawl completed: ${articles.length} of ${result.total} pages crawled successfully.\n\n${body}`;
}
/**
 * Serialises a crawl result as pretty-printed JSON.
 *
 * The output has a `summary` (total, number of completed pages, job status)
 * and a `pages` array with one entry per record whose status is "completed".
 *
 * @param {object} result Crawl result with `records`, `total` and `status`.
 * @returns {string} JSON text indented with two spaces.
 */
function formatJsonResult(result) {
  const pages = [];
  for (const page of result.records || []) {
    if (page.status !== "completed") {
      continue;
    }
    pages.push({
      url: page.url,
      title: page.metadata?.title,
      status: page.metadata?.status,
      markdown: page.markdown,
      html: page.html,
      json: page.json,
    });
  }
  const summary = {
    total: result.total,
    completed: pages.length,
    status: result.status,
  };
  return JSON.stringify({ summary, pages }, null, 2);
}
/**
 * Builds the MCP error response for a crawl job that ended unsuccessfully.
 *
 * Known failure statuses get a dedicated message; any other status is
 * reported with a generic message that includes the status text.
 *
 * @param {object} result Crawl result; only `status` is read.
 * @param {string} jobId Crawl job identifier, echoed in the message.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 */
function handleErrorResult(result, jobId) {
  const knownFailures = {
    errored: `Crawl job errored. Job ID: ${jobId}`,
    cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
    cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
    cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
  };
  let text = knownFailures[result.status];
  if (!text) {
    text = `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
  }
  return {
    content: [{ type: "text", text }],
    isError: true,
  };
}
// Table-driven tests for getEnv: a present value is returned, while an empty
// or missing variable raises the "Missing required environment variable" error.
describe('getEnv', () => {
  const testCases = [
    {
      name: 'returns value when env var exists',
      envKey: 'TEST_VAR',
      envValue: 'test-value',
      expected: 'test-value',
    },
    {
      name: 'throws when env var is empty string',
      envKey: 'EMPTY_VAR',
      envValue: '',
      expectedError: 'Missing required environment variable: EMPTY_VAR',
    },
    {
      name: 'throws when env var is undefined',
      envKey: 'UNDEFINED_VAR',
      envValue: undefined,
      expectedError: 'Missing required environment variable: UNDEFINED_VAR',
    },
  ];
  it.each(testCases)('$name', ({ envKey, envValue, expected, expectedError }) => {
    // Remember the pre-test state so the case does not leak environment
    // changes into other tests running in the same process.
    const hadPrevious = Object.prototype.hasOwnProperty.call(process.env, envKey);
    const previous = process.env[envKey];
    try {
      if (envValue === undefined) {
        delete process.env[envKey];
      }
      else {
        process.env[envKey] = envValue;
      }
      if (expectedError) {
        expect(() => getEnv(envKey)).toThrow(expectedError);
      }
      else {
        expect(getEnv(envKey)).toBe(expected);
      }
    }
    finally {
      if (hadPrevious) {
        process.env[envKey] = previous;
      }
      else {
        delete process.env[envKey];
      }
    }
  });
});
// Table-driven tests for buildCrawlOptions: each scenario supplies tool
// arguments plus formats and the option object expected back.
describe('buildCrawlOptions', () => {
  // Filter block expected when none of the link-filter arguments is supplied.
  const unsetFilters = {
    includeExternalLinks: undefined,
    includeSubdomains: undefined,
    includePatterns: undefined,
    excludePatterns: undefined,
  };
  const scenarios = [
    {
      name: 'builds options with markdown format',
      args: { url: 'https://example.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://example.com',
        limit: undefined,
        depth: undefined,
        formats: ['markdown'],
        render: undefined,
        options: unsetFilters,
      },
    },
    {
      name: 'builds options with all parameters',
      args: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        includeSubdomains: true,
        includeExternalLinks: false,
        includePatterns: ['**/docs/**'],
        excludePatterns: ['**/archive/**'],
        render: true,
      },
      formats: ['html'],
      expected: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        formats: ['html'],
        render: true,
        options: {
          includeExternalLinks: false,
          includeSubdomains: true,
          includePatterns: ['**/docs/**'],
          excludePatterns: ['**/archive/**'],
        },
      },
    },
    {
      name: 'builds options with json format',
      args: { url: 'https://api.example.com', limit: 100 },
      formats: ['json'],
      expected: {
        url: 'https://api.example.com',
        limit: 100,
        formats: ['json'],
        depth: undefined,
        render: undefined,
        options: unsetFilters,
      },
    },
    {
      name: 'handles empty options object',
      args: { url: 'https://test.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://test.com',
        formats: ['markdown'],
        options: unsetFilters,
      },
    },
  ];
  it.each(scenarios)('$name', (scenario) => {
    const built = buildCrawlOptions(scenario.args, scenario.formats);
    expect(built).toEqual(scenario.expected);
  });
});
// Table-driven tests for formatMarkdownResult: each scenario lists the text
// fragments that must appear in the rendered Markdown.
describe('formatMarkdownResult', () => {
  // Builds a finished crawl result whose `total` equals the number of records.
  const crawlOf = (...records) => ({ total: records.length, status: 'completed', records });
  const scenarios = [
    {
      name: 'formats single completed page',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        markdown: '# Hello World',
        metadata: { title: 'Home Page', status: 200 },
      }),
      expectedContains: ['## Home Page', '# Hello World', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: crawlOf(
        {
          url: 'https://example.com',
          status: 'completed',
          markdown: '# Page 1',
          metadata: { title: 'Page One', status: 200 },
        },
        {
          url: 'https://example.com/about',
          status: 'completed',
          markdown: '# About Us',
          metadata: { title: 'About', status: 200 },
        },
      ),
      expectedContains: ['## Page One', '## About', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing markdown content',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        markdown: '',
        metadata: { title: 'Test', status: 200 },
      }),
      expectedContains: ['## Test', 'URL: https://example.com'],
    },
    {
      name: 'uses url as title when metadata.title is missing',
      result: crawlOf({
        url: 'https://example.com/unnamed',
        status: 'completed',
        markdown: 'Content here',
      }),
      expectedContains: ['## https://example.com/unnamed', 'Content here'],
    },
    {
      name: 'handles empty records array',
      result: crawlOf(),
      expectedContains: ['Crawl completed: 0 of 0'],
    },
    {
      name: 'filters out non-completed records',
      result: crawlOf(
        { url: 'https://example.com/1', status: 'completed', markdown: '# Done' },
        { url: 'https://example.com/2', status: 'errored', markdown: '# Failed' },
        { url: 'https://example.com/3', status: 'skipped' },
      ),
      expectedContains: ['Crawl completed: 1 of 3', '# Done'],
    },
  ];
  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatMarkdownResult(result);
    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
// Table-driven tests for formatHtmlResult: each scenario lists the markup
// fragments that must appear in the rendered HTML.
describe('formatHtmlResult', () => {
  // Builds a finished crawl result whose `total` equals the number of records.
  const crawlOf = (...records) => ({ total: records.length, status: 'completed', records });
  const scenarios = [
    {
      name: 'formats single completed page with HTML',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        html: '<p>Hello World</p>',
        metadata: { title: 'Home Page', status: 200 },
      }),
      expectedContains: ['<h2>Home Page</h2>', '<p>Hello World</p>', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: crawlOf(
        {
          url: 'https://example.com',
          status: 'completed',
          html: '<div>Page 1</div>',
          metadata: { title: 'Page One', status: 200 },
        },
        {
          url: 'https://example.com/about',
          status: 'completed',
          html: '<div>About Us</div>',
          metadata: { title: 'About', status: 200 },
        },
      ),
      expectedContains: ['<h2>Page One</h2>', '<h2>About</h2>', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing HTML content',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        html: '',
        metadata: { title: 'Test', status: 200 },
      }),
      expectedContains: ['<h2>Test</h2>', '<a href="https://example.com">'],
    },
  ];
  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatHtmlResult(result);
    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
// Table-driven tests for formatJsonResult: the output must parse as JSON, the
// summary must reflect the input, and `pages` must contain exactly the
// completed records in their original order.
describe('formatJsonResult', () => {
  const testCases = [
    {
      name: 'formats single completed page as JSON',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Hello',
            html: '<h1>Hello</h1>',
            json: { key: 'value' },
            metadata: { title: 'Home', status: 200 },
          },
        ],
      },
    },
    {
      name: 'formats multiple completed pages as JSON',
      result: {
        total: 2,
        status: 'completed',
        records: [
          {
            url: 'https://example.com/page1',
            status: 'completed',
            markdown: '# Page 1',
          },
          {
            url: 'https://example.com/page2',
            status: 'completed',
            markdown: '# Page 2',
          },
        ],
      },
    },
    {
      name: 'includes summary with correct counts',
      result: {
        total: 5,
        status: 'completed',
        records: [
          { url: 'https://example.com/1', status: 'completed' },
          { url: 'https://example.com/2', status: 'completed' },
          { url: 'https://example.com/3', status: 'errored' },
          { url: 'https://example.com/4', status: 'skipped' },
          { url: 'https://example.com/5', status: 'completed' },
        ],
      },
    },
    {
      name: 'handles empty records',
      result: {
        total: 0,
        status: 'completed',
        records: [],
      },
    },
  ];
  it.each(testCases)('$name', ({ result }) => {
    const output = formatJsonResult(result);
    const parsed = JSON.parse(output);
    expect(parsed).toHaveProperty('summary');
    expect(parsed).toHaveProperty('pages');
    const completedRecords = result.records.filter((r) => r.status === 'completed');
    expect(parsed.summary.completed).toBe(completedRecords.length);
    expect(parsed.summary.total).toBe(result.total);
    expect(parsed.summary.status).toBe(result.status);
    // Previously the pages themselves were never checked, so a formatter that
    // dropped or failed to filter records would still have passed.
    expect(parsed.pages).toHaveLength(completedRecords.length);
    expect(parsed.pages.map((page) => page.url)).toEqual(completedRecords.map((r) => r.url));
  });
});
// Table-driven checks that every terminal failure status is reported as an
// MCP error whose text mentions both the status wording and the job id.
describe('handleErrorResult', () => {
    const scenarios = [
        {
            name: 'handles errored status',
            result: { status: 'errored' },
            jobId: 'test-job-123',
            expectedError: true,
            expectedContains: ['errored', 'test-job-123'],
        },
        {
            name: 'handles cancelled_due_to_timeout status',
            result: { status: 'cancelled_due_to_timeout' },
            jobId: 'job-456',
            expectedError: true,
            expectedContains: ['timeout', 'job-456'],
        },
        {
            name: 'handles cancelled_due_to_limits status',
            result: { status: 'cancelled_due_to_limits' },
            jobId: 'job-789',
            expectedError: true,
            expectedContains: ['limits', 'job-789'],
        },
        {
            name: 'handles cancelled_by_user status',
            result: { status: 'cancelled_by_user' },
            jobId: 'job-000',
            expectedError: true,
            expectedContains: ['cancelled by user', 'job-000'],
        },
        {
            name: 'handles unknown status',
            result: { status: 'some_unknown_status' },
            jobId: 'job-unknown',
            expectedError: true,
            expectedContains: ['some_unknown_status', 'job-unknown'],
        },
    ];

    it.each(scenarios)('$name', ({ result, jobId, expectedError, expectedContains }) => {
        const response = handleErrorResult(result, jobId);
        expect(response.isError).toBe(expectedError);
        // Each fragment is looked up in the first (and only) content entry.
        for (const fragment of expectedContains) {
            expect(response.content[0].text).toContain(fragment);
        }
    });
});
// Exercises initiateCrawl against a stubbed global fetch: the happy path,
// a non-2xx HTTP reply, and a 200 reply whose body reports success: false.
describe('initiateCrawl', () => {
    beforeEach(() => {
        vi.stubGlobal('fetch', vi.fn());
    });

    const scenarios = [
        {
            name: 'initiates crawl successfully',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            options: { url: 'https://example.com', formats: ['markdown'] },
            mockResponse: { success: true, result: { id: 'job-123' } },
            expectedJobId: 'job-123',
        },
        {
            name: 'throws on HTTP error',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            options: { url: 'https://example.com' },
            mockResponse: null,
            mockStatus: 401,
            expectedError: 'Failed to initiate crawl: 401',
        },
        {
            name: 'throws on API failure',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            options: { url: 'https://example.com' },
            mockResponse: { success: false, errors: [{ message: 'Invalid URL' }] },
            expectedError: 'Crawl initiation failed',
        },
    ];

    it.each(scenarios)('$name', async ({ accountId, apiToken, options, mockResponse, mockStatus, expectedJobId, expectedError }) => {
        const fetchMock = vi.mocked(fetch);

        // Only a failing scenario with an explicit status gets an empty
        // non-2xx reply; every other scenario gets a 200 JSON reply.
        const reply = expectedError && mockStatus
            ? new Response('', { status: mockStatus })
            : new Response(JSON.stringify(mockResponse), {
                status: 200,
                headers: { 'Content-Type': 'application/json' }
            });
        fetchMock.mockResolvedValueOnce(reply);

        if (expectedError) {
            await expect(initiateCrawl(accountId, apiToken, options)).rejects.toThrow(expectedError);
            return;
        }
        const jobId = await initiateCrawl(accountId, apiToken, options);
        expect(jobId).toBe(expectedJobId);
    });
});
// Checks that waitForCrawl hands back the job result as soon as the stubbed
// API reports any status other than "running".
describe('waitForCrawl', () => {
    beforeEach(() => {
        vi.stubGlobal('fetch', vi.fn());
    });

    const scenarios = [
        {
            name: 'returns completed result immediately',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            jobId: 'job-123',
            mockResponse: { result: { status: 'completed', total: 5, records: [] } },
            expectedStatus: 'completed',
        },
        {
            name: 'returns errored result',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            jobId: 'job-123',
            mockResponse: { result: { status: 'errored', error: 'Something went wrong' } },
            expectedStatus: 'errored',
        },
        {
            name: 'returns cancelled_due_to_limits result',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            jobId: 'job-123',
            mockResponse: { result: { status: 'cancelled_due_to_limits' } },
            expectedStatus: 'cancelled_due_to_limits',
        },
    ];

    it.each(scenarios)('$name', async ({ accountId, apiToken, jobId, mockResponse, expectedStatus }) => {
        const reply = new Response(JSON.stringify(mockResponse), {
            status: 200,
            headers: { 'Content-Type': 'application/json' }
        });
        vi.mocked(fetch).mockResolvedValue(reply);

        // One attempt with a 1 ms delay keeps the test fast.
        const outcome = await waitForCrawl(accountId, apiToken, jobId, 1, 1);
        expect(outcome.status).toBe(expectedStatus);
    });
});
+1
View File
@@ -0,0 +1 @@
export {};
+134
View File
@@ -0,0 +1,134 @@
import { describe, it, expect, beforeAll } from 'vitest';
const API_BASE = "https://api.cloudflare.com/client/v4";
/**
 * Starts a crawl job through the Cloudflare Browser Rendering crawl endpoint.
 *
 * Omitted options fall back to: limit 10, depth 1, formats ["markdown"],
 * render true, and an empty nested options object.
 *
 * @param accountId Cloudflare account id used in the endpoint path.
 * @param apiToken API token sent as a Bearer credential.
 * @param options Crawl settings; only `url` is mandatory.
 * @returns The id of the created crawl job.
 * @throws Error when the HTTP status is not OK or the body reports success: false.
 */
async function initiateCrawl(accountId, apiToken, options) {
    const endpoint = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`;
    const payload = {
        url: options.url,
        limit: options.limit ?? 10,
        depth: options.depth ?? 1,
        formats: options.formats ?? ["markdown"],
        render: options.render ?? true,
        options: options.options ?? {},
    };
    const response = await fetch(endpoint, {
        method: "POST",
        headers: {
            Authorization: `Bearer ${apiToken}`,
            "Content-Type": "application/json",
        },
        body: JSON.stringify(payload),
    });

    if (response.ok) {
        const data = await response.json();
        if (data.success) {
            return data.result.id;
        }
        throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
    }

    // Non-2xx: surface the status code together with the raw response body.
    const error = await response.text();
    throw new Error(`Failed to initiate crawl: ${response.status} ${error}`);
}
/**
 * Polls the status endpoint of a crawl job until it leaves the "running" state.
 *
 * @param accountId Cloudflare account id used in the endpoint path.
 * @param apiToken API token sent as a Bearer credential.
 * @param jobId Id of the crawl job to watch.
 * @param maxAttempts Maximum number of status requests (default 60).
 * @param delayMs Pause after each "running" reply, in milliseconds (default 5000).
 * @returns The `result` object of the first reply whose status is not "running".
 * @throws Error on a non-OK HTTP status, or when every attempt still reports "running".
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 60, delayMs = 5000) {
    const pause = () => new Promise((resolve) => setTimeout(resolve, delayMs));
    let attempt = 0;
    while (attempt < maxAttempts) {
        const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`, {
            headers: {
                Authorization: `Bearer ${apiToken}`,
            },
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
        }
        const { result } = await response.json();
        if (result.status !== "running") {
            return result;
        }
        await pause();
        attempt += 1;
    }
    throw new Error("Crawl job did not complete within timeout");
}
/**
 * Reads a required environment variable.
 *
 * @param key Name of the variable to look up in process.env.
 * @returns The variable's value.
 * @throws Error when the variable is unset or is an empty string.
 */
function getEnv(key) {
    const raw = process.env[key];
    if (raw) {
        return raw;
    }
    throw new Error(`Missing required environment variable: ${key}`);
}
// Live integration suite: talks to the real Cloudflare API, so it only runs
// when both CF_API_TOKEN and CF_ACCOUNT_ID are present in the environment.
describe('Integration: Cloudflare Crawl API', () => {
const apiToken = process.env.CF_API_TOKEN;
const accountId = process.env.CF_ACCOUNT_ID;
// Truthy only when both credentials are non-empty; evaluated once, when the suite is collected.
const hasCredentials = apiToken && accountId;
beforeAll(() => {
// Explain in the test output why the crawl test below is reported as skipped.
if (!hasCredentials) {
console.log('\n⚠️ Skipping integration tests - CF_API_TOKEN or CF_ACCOUNT_ID not set\n');
}
});
it.skipIf(!hasCredentials)('should crawl raczylo.com with multiple pages in markdown format', async () => {
// Re-read through getEnv so a missing value fails with a descriptive error.
const accountId = getEnv("CF_ACCOUNT_ID");
const apiToken = getEnv("CF_API_TOKEN");
try {
// Start a small crawl: at most 5 pages, link depth 2, markdown output only.
const jobId = await initiateCrawl(accountId, apiToken, {
url: "https://raczylo.com",
limit: 5,
depth: 2,
formats: ["markdown"],
});
console.log(` Started crawl job: ${jobId}`);
expect(jobId).toBeDefined();
expect(typeof jobId).toBe("string");
// Poll up to 60 times, 5 seconds apart, until the job stops reporting "running".
const result = await waitForCrawl(accountId, apiToken, jobId, 60, 5000);
console.log(` Crawl status: ${result.status}`);
console.log(` Total pages discovered: ${result.total}`);
console.log(` Pages finished: ${result.finished}`);
expect(result.status).toBe("completed");
expect(result.total).toBeGreaterThan(0);
expect(result.records).toBeDefined();
expect(Array.isArray(result.records)).toBe(true);
expect(result.records.length).toBeGreaterThan(0);
// Every completed record must carry a URL and non-empty markdown.
const completedRecords = result.records.filter((r) => r.status === "completed");
console.log(` Completed pages: ${completedRecords.length}`);
completedRecords.forEach((record, index) => {
expect(record.url).toBeDefined();
expect(record.markdown).toBeDefined();
expect(record.markdown.length).toBeGreaterThan(0);
console.log(` Page ${index + 1}: ${record.url} (${record.markdown.length} chars)`);
});
// The first returned record is expected to contain at least one "#" character.
const firstRecord = result.records[0];
expect(firstRecord.markdown).toContain("#");
}
catch (error) {
// An error mentioning "Rate limit" is logged and the test returns without failing.
if (error.message.includes("Rate limit")) {
console.log(" ⚠️ Skipped - Rate limit exceeded");
return;
}
throw error;
}
// Test timeout of 360000 ms (6 minutes) leaves room for the polling loop above.
}, 360000);
});
// Verifies getEnv for the two credentials the crawl client relies on.
// These tests remove CF_API_TOKEN / CF_ACCOUNT_ID from process.env.
describe('Environment Variable Validation', () => {
    const requiredVariables = [
        {
            name: 'CF_API_TOKEN is required',
            envKey: 'CF_API_TOKEN',
            expectedError: 'Missing required environment variable: CF_API_TOKEN',
        },
        {
            name: 'CF_ACCOUNT_ID is required',
            envKey: 'CF_ACCOUNT_ID',
            expectedError: 'Missing required environment variable: CF_ACCOUNT_ID',
        },
    ];

    it.each(requiredVariables)('$name', ({ envKey, expectedError }) => {
        delete process.env[envKey];
        const lookup = () => getEnv(envKey);
        expect(lookup).toThrow(expectedError);
    });

    it('should return value when CF_API_TOKEN is set', () => {
        process.env.CF_API_TOKEN = 'test-token';
        const value = getEnv('CF_API_TOKEN');
        expect(value).toBe('test-token');
        delete process.env.CF_API_TOKEN;
    });

    it('should return value when CF_ACCOUNT_ID is set', () => {
        process.env.CF_ACCOUNT_ID = 'test-account';
        const value = getEnv('CF_ACCOUNT_ID');
        expect(value).toBe('test-account');
        delete process.env.CF_ACCOUNT_ID;
    });
});
+6926
View File
File diff suppressed because it is too large Load Diff
+26
View File
@@ -0,0 +1,26 @@
{
"name": "@lukaszraczylo/cloudflare-crawl-mcp",
"version": "1.0.0",
"description": "MCP server for Cloudflare Browser Rendering Crawl API",
"author": "Lukasz Raczylo <hello@raczylo.com> (https://raczylo.com)",
"main": "dist/index.js",
"type": "module",
"scripts": {
"build": "tsc",
"start": "node dist/index.js",
"test": "vitest run",
"test:watch": "vitest",
"prepublishOnly": "npm run build"
},
"dependencies": {
"@modelcontextprotocol/sdk": "^1.27.1"
},
"devDependencies": {
"@types/node": "^25.4.0",
"typescript": "^5.9.3",
"vitest": "^4.0.18"
},
"publishConfig": {
"access": "public"
}
}
+724
View File
@@ -0,0 +1,724 @@
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
const API_BASE = "https://api.cloudflare.com/client/v4";
/**
 * Settings for one crawl job, as consumed by initiateCrawl.
 * Only `url` is required; initiateCrawl substitutes defaults for omitted fields.
 */
interface CrawlOptions {
/** The starting URL to crawl. */
url: string;
/** Maximum number of pages to crawl; initiateCrawl sends 10 when omitted. */
limit?: number;
/** Maximum link depth from the starting URL; initiateCrawl sends 1 when omitted. */
depth?: number;
/** Requested output formats; initiateCrawl sends ["markdown"] when omitted. */
formats?: string[];
/** Forwarded as the `render` flag; initiateCrawl sends true when omitted. */
render?: boolean;
/** Forwarded unchanged as `maxAge`. NOTE(review): unit and meaning are not visible here - confirm against the API docs. */
maxAge?: number;
/** Forwarded as `source`; initiateCrawl sends "all" when omitted. NOTE(review): accepted values not visible here. */
source?: string;
/** Link-filtering settings forwarded as the nested `options` object (an empty object when omitted). */
options?: {
includeExternalLinks?: boolean;
includeSubdomains?: boolean;
includePatterns?: string[];
excludePatterns?: string[];
};
}
/**
 * Reads a required environment variable.
 *
 * @param key - Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or is an empty string.
 */
function getEnv(key: string): string {
  const raw = process.env[key];
  if (raw) {
    return raw;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
/**
 * Starts a crawl job through the Cloudflare Browser Rendering crawl endpoint.
 *
 * Omitted options fall back to: limit 10, depth 1, formats ["markdown"],
 * render true, source "all", and an empty nested options object. `maxAge` is
 * forwarded as-is (and dropped from the JSON body when undefined).
 *
 * @param accountId - Cloudflare account id used in the endpoint path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param options - Crawl settings; only `url` is mandatory.
 * @returns The id of the created crawl job.
 * @throws Error when the HTTP status is not OK, when the body reports
 *   `success: false`, or when a successful body carries no job id.
 */
async function initiateCrawl(
  accountId: string,
  apiToken: string,
  options: CrawlOptions
): Promise<string> {
  const response = await fetch(
    `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`,
    {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiToken}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: options.url,
        limit: options.limit ?? 10,
        depth: options.depth ?? 1,
        formats: options.formats ?? ["markdown"],
        render: options.render ?? true,
        maxAge: options.maxAge,
        source: options.source ?? "all",
        options: options.options ?? {},
      }),
    }
  );

  if (!response.ok) {
    const error = await response.text();
    throw new Error(`Failed to initiate crawl: ${response.status} ${error}`);
  }

  const data = await response.json();
  if (!data.success) {
    throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
  }

  // Guard against a "successful" body without a job id so callers get a
  // descriptive error instead of a TypeError (or an undefined job id).
  const jobId = data.result?.id;
  if (jobId == null || jobId === "") {
    throw new Error("Crawl initiation failed: response did not include a job id");
  }
  return jobId;
}
/**
 * Polls the status endpoint of a crawl job until it leaves the "running" state.
 *
 * @param accountId - Cloudflare account id used in the endpoint path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param jobId - Id of the crawl job to watch.
 * @param maxAttempts - Maximum number of status requests (default 120).
 * @param delayMs - Pause between two status requests, in milliseconds (default 5000).
 * @returns The `result` object of the first reply whose status is not "running".
 * @throws Error on a non-OK HTTP status, on a reply without a `result`, or
 *   when every attempt still reports "running".
 */
async function waitForCrawl(
  accountId: string,
  apiToken: string,
  jobId: string,
  maxAttempts: number = 120,
  delayMs: number = 5000
): Promise<any> {
  const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const response = await fetch(statusUrl, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
    }

    const data = await response.json();
    const result = data?.result;
    if (!result) {
      // A 200 reply without a result would otherwise surface as an opaque TypeError.
      throw new Error(
        `Failed to check crawl status: response contained no result ${JSON.stringify(data?.errors ?? data)}`
      );
    }

    if (result.status !== "running") {
      return result;
    }

    // Only pause when another attempt follows; sleeping after the last
    // attempt would merely delay the timeout error below.
    if (attempt < maxAttempts - 1) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  throw new Error("Crawl job did not complete within timeout");
}
/**
 * Flat argument bag accepted by buildCrawlOptions, which copies the scalar
 * fields through and groups the four link-filter fields under `options`.
 */
interface CrawlArgs {
/** The starting URL to crawl. */
url: string;
/** Maximum number of pages to crawl. */
limit?: number;
/** Maximum link depth to crawl from the starting URL. */
depth?: number;
/** Copied to `options.includeSubdomains`. */
includeSubdomains?: boolean;
/** Copied to `options.includeExternalLinks`. */
includeExternalLinks?: boolean;
/** Copied to `options.includePatterns`. */
includePatterns?: string[];
/** Copied to `options.excludePatterns`. */
excludePatterns?: string[];
/** Copied to the top-level `render` field. */
render?: boolean;
}
/**
 * Converts flat tool arguments into the nested CrawlOptions shape.
 *
 * @param args - Flat arguments; absent fields stay `undefined` in the output.
 * @param formats - Output formats to request, used as given.
 * @returns Crawl options with the four link-filter fields grouped under `options`.
 */
function buildCrawlOptions(args: CrawlArgs, formats: string[]): CrawlOptions {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;

  // The link-filter fields live in their own nested object.
  const linkFilters = { includeExternalLinks, includeSubdomains, includePatterns, excludePatterns };

  return { url, limit, depth, formats, render, options: linkFilters };
}
/**
 * Renders the completed pages of a crawl result as one Markdown document.
 *
 * Each completed record becomes a section headed by its metadata title (or
 * its URL when no title is present), followed by the URL, the page markdown
 * and a horizontal rule. Records with any other status are left out.
 *
 * @param result - Crawl result; `records` may be missing.
 * @returns A summary line followed by the page sections.
 */
function formatMarkdownResult(result: any): string {
  const isCompleted = (entry: any): boolean => entry.status === "completed";

  const toSection = (entry: any): string => {
    const heading = entry.metadata?.title || entry.url;
    const body = entry.markdown || "";
    // Blank-line separated parts; the rule keeps its trailing newline.
    return [`## ${heading}`, `URL: ${entry.url}`, body, "---\n"].join("\n\n");
  };

  const sections: string[] = (result.records || []).filter(isCompleted).map(toSection);
  const summary = `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.`;
  return `${summary}\n\n${sections.join("\n")}`;
}
/**
 * Renders the completed pages of a crawl result as a sequence of
 * `<article>` elements preceded by a plain-text summary line.
 *
 * The page title and URL come from crawled (untrusted) content, so they are
 * HTML-escaped before being placed into the heading, the link text and the
 * `href` attribute. The page body (`record.html`) is the crawled markup
 * itself and is embedded unchanged.
 *
 * @param result - Crawl result; `records` may be missing.
 * @returns A summary line followed by one article per completed record.
 */
function formatHtmlResult(result: any): string {
  // Escapes the characters that are significant in HTML text and attributes.
  const escapeHtml = (value: unknown): string =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");

  const records = result.records || [];
  const completedRecords = records.filter((r: any) => r.status === "completed");
  const content = completedRecords
    .map((record: any) => {
      const url = escapeHtml(record.url);
      // Fall back to the URL when the page exposes no title.
      const title = escapeHtml(record.metadata?.title || record.url);
      return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${url}">${url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
    })
    .join("\n");
  return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
/**
 * Serialises a crawl result as pretty-printed JSON (2-space indent).
 *
 * The output has a `summary` (total, completed count, overall status) and a
 * `pages` array holding one entry per record whose status is "completed".
 *
 * @param result - Crawl result; `records` may be missing.
 * @returns The JSON text.
 */
function formatJsonResult(result: any): string {
  const finished = (result.records || []).filter((entry: any) => entry.status === "completed");

  const pages = finished.map(({ url, metadata, markdown, html, json }: any) => ({
    url,
    title: metadata?.title,
    status: metadata?.status,
    markdown,
    html,
    json,
  }));

  const summary = {
    total: result.total,
    completed: finished.length,
    status: result.status,
  };

  return JSON.stringify({ summary, pages }, null, 2);
}
/**
 * Builds the MCP error payload for a crawl job that ended in a failure state.
 *
 * Known statuses get a dedicated message; any other status is echoed in a
 * generic message. A Map is used for the lookup so that statuses which happen
 * to match inherited object keys (for example "constructor") cannot resolve
 * to something that is not a message.
 *
 * @param result - Crawl result; only `status` is read.
 * @param jobId - Id of the crawl job, included in every message.
 * @returns A single text content entry with `isError` set to true.
 */
function handleErrorResult(result: any, jobId: string): { content: any[]; isError: boolean } {
  const errorMessages = new Map<string, string>([
    ["errored", `Crawl job errored. Job ID: ${jobId}`],
    ["cancelled_due_to_timeout", `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`],
    ["cancelled_due_to_limits", `Crawl job cancelled due to account limits. Job ID: ${jobId}`],
    ["cancelled_by_user", `Crawl job was cancelled by user. Job ID: ${jobId}`],
  ]);

  const message =
    errorMessages.get(result.status) ??
    `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;

  return {
    content: [{ type: "text" as const, text: message }],
    isError: true,
  };
}
// Table-driven checks for getEnv: a present value is returned, while an empty
// or missing variable raises the "Missing required environment variable" error.
describe('getEnv', () => {
  const testCases = [
    {
      name: 'returns value when env var exists',
      envKey: 'TEST_VAR',
      envValue: 'test-value',
      expected: 'test-value',
    },
    {
      name: 'throws when env var is empty string',
      envKey: 'EMPTY_VAR',
      envValue: '',
      expectedError: 'Missing required environment variable: EMPTY_VAR',
    },
    {
      name: 'throws when env var is undefined',
      envKey: 'UNDEFINED_VAR',
      envValue: undefined,
      expectedError: 'Missing required environment variable: UNDEFINED_VAR',
    },
  ];

  // The cases write to process.env; remember what was there before each test
  // and put it back afterwards so nothing leaks into other suites.
  const previousValues = new Map<string, string | undefined>();

  beforeEach(() => {
    testCases.forEach(({ envKey }) => {
      previousValues.set(envKey, process.env[envKey]);
    });
  });

  afterEach(() => {
    previousValues.forEach((value, key) => {
      if (value === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = value;
      }
    });
    previousValues.clear();
  });

  it.each(testCases)('$name', ({ envKey, envValue, expected, expectedError }) => {
    if (expectedError) {
      if (envValue === undefined) {
        delete process.env[envKey];
      } else {
        process.env[envKey] = envValue;
      }
      expect(() => getEnv(envKey)).toThrow(expectedError);
    } else {
      process.env[envKey] = envValue;
      expect(getEnv(envKey)).toBe(expected);
    }
  });
});
// Table-driven checks that buildCrawlOptions copies scalar arguments through
// and nests the link-filter arguments under `options`.
describe('buildCrawlOptions', () => {
  const scenarios = [
    {
      name: 'builds options with markdown format',
      args: { url: 'https://example.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://example.com',
        limit: undefined,
        depth: undefined,
        formats: ['markdown'],
        render: undefined,
        options: {
          includeExternalLinks: undefined,
          includeSubdomains: undefined,
          includePatterns: undefined,
          excludePatterns: undefined,
        },
      },
    },
    {
      name: 'builds options with all parameters',
      args: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        includeSubdomains: true,
        includeExternalLinks: false,
        includePatterns: ['**/docs/**'],
        excludePatterns: ['**/archive/**'],
        render: true,
      },
      formats: ['html'],
      expected: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        formats: ['html'],
        render: true,
        options: {
          includeExternalLinks: false,
          includeSubdomains: true,
          includePatterns: ['**/docs/**'],
          excludePatterns: ['**/archive/**'],
        },
      },
    },
    {
      name: 'builds options with json format',
      args: { url: 'https://api.example.com', limit: 100 },
      formats: ['json'],
      expected: {
        url: 'https://api.example.com',
        limit: 100,
        formats: ['json'],
        depth: undefined,
        render: undefined,
        options: {
          includeExternalLinks: undefined,
          includeSubdomains: undefined,
          includePatterns: undefined,
          excludePatterns: undefined,
        },
      },
    },
    {
      name: 'handles empty options object',
      args: { url: 'https://test.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://test.com',
        formats: ['markdown'],
        options: {
          includeExternalLinks: undefined,
          includeSubdomains: undefined,
          includePatterns: undefined,
          excludePatterns: undefined,
        },
      },
    },
  ];

  it.each(scenarios)('$name', ({ args, formats, expected }) => {
    const built = buildCrawlOptions(args, formats);
    expect(built).toEqual(expected);
  });
});
// Table-driven checks for the Markdown rendering of crawl results: headings,
// URL lines, the summary line, title fallback and status filtering.
describe('formatMarkdownResult', () => {
  const scenarios = [
    {
      name: 'formats single completed page',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Hello World',
            metadata: { title: 'Home Page', status: 200 },
          },
        ],
      },
      expectedContains: ['## Home Page', '# Hello World', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: {
        total: 2,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Page 1',
            metadata: { title: 'Page One', status: 200 },
          },
          {
            url: 'https://example.com/about',
            status: 'completed',
            markdown: '# About Us',
            metadata: { title: 'About', status: 200 },
          },
        ],
      },
      expectedContains: ['## Page One', '## About', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing markdown content',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '',
            metadata: { title: 'Test', status: 200 },
          },
        ],
      },
      expectedContains: ['## Test', 'URL: https://example.com'],
    },
    {
      name: 'uses url as title when metadata.title is missing',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com/unnamed',
            status: 'completed',
            markdown: 'Content here',
          },
        ],
      },
      expectedContains: ['## https://example.com/unnamed', 'Content here'],
    },
    {
      name: 'handles empty records array',
      result: { total: 0, status: 'completed', records: [] },
      expectedContains: ['Crawl completed: 0 of 0'],
    },
    {
      name: 'filters out non-completed records',
      result: {
        total: 3,
        status: 'completed',
        records: [
          { url: 'https://example.com/1', status: 'completed', markdown: '# Done' },
          { url: 'https://example.com/2', status: 'errored', markdown: '# Failed' },
          { url: 'https://example.com/3', status: 'skipped' },
        ],
      },
      expectedContains: ['Crawl completed: 1 of 3', '# Done'],
    },
  ];

  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatMarkdownResult(result);
    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
// Table-driven checks for the HTML rendering of crawl results: headings,
// source links, embedded page markup and the summary line.
describe('formatHtmlResult', () => {
  const scenarios = [
    {
      name: 'formats single completed page with HTML',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            html: '<p>Hello World</p>',
            metadata: { title: 'Home Page', status: 200 },
          },
        ],
      },
      expectedContains: ['<h2>Home Page</h2>', '<p>Hello World</p>', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: {
        total: 2,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            html: '<div>Page 1</div>',
            metadata: { title: 'Page One', status: 200 },
          },
          {
            url: 'https://example.com/about',
            status: 'completed',
            html: '<div>About Us</div>',
            metadata: { title: 'About', status: 200 },
          },
        ],
      },
      expectedContains: ['<h2>Page One</h2>', '<h2>About</h2>', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing HTML content',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            html: '',
            metadata: { title: 'Test', status: 200 },
          },
        ],
      },
      expectedContains: ['<h2>Test</h2>', '<a href="https://example.com">'],
    },
  ];

  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatHtmlResult(result);
    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
// Table-driven checks that formatJsonResult produces parseable JSON whose
// summary block mirrors the crawl result it was given.
describe('formatJsonResult', () => {
  const scenarios = [
    {
      name: 'formats single completed page as JSON',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Hello',
            html: '<h1>Hello</h1>',
            json: { key: 'value' },
            metadata: { title: 'Home', status: 200 },
          },
        ],
      },
    },
    {
      name: 'formats multiple completed pages as JSON',
      result: {
        total: 2,
        status: 'completed',
        records: [
          { url: 'https://example.com/page1', status: 'completed', markdown: '# Page 1' },
          { url: 'https://example.com/page2', status: 'completed', markdown: '# Page 2' },
        ],
      },
    },
    {
      name: 'includes summary with correct counts',
      result: {
        total: 5,
        status: 'completed',
        records: [
          { url: 'https://example.com/1', status: 'completed' },
          { url: 'https://example.com/2', status: 'completed' },
          { url: 'https://example.com/3', status: 'errored' },
          { url: 'https://example.com/4', status: 'skipped' },
          { url: 'https://example.com/5', status: 'completed' },
        ],
      },
    },
    {
      name: 'handles empty records',
      result: { total: 0, status: 'completed', records: [] },
    },
  ];

  it.each(scenarios)('$name', ({ result }) => {
    const parsed = JSON.parse(formatJsonResult(result));

    // Both top-level sections must always be present.
    for (const section of ['summary', 'pages']) {
      expect(parsed).toHaveProperty(section);
    }

    // Only records whose status is 'completed' count towards the summary.
    const expectedCompleted = result.records.filter((record: any) => record.status === 'completed').length;
    expect(parsed.summary.completed).toBe(expectedCompleted);
    expect(parsed.summary.total).toBe(result.total);
    expect(parsed.summary.status).toBe(result.status);
  });
});
// Table-driven checks that every terminal failure status is reported as an
// MCP error whose text mentions both the status wording and the job id.
describe('handleErrorResult', () => {
  const scenarios = [
    {
      name: 'handles errored status',
      result: { status: 'errored' },
      jobId: 'test-job-123',
      expectedError: true,
      expectedContains: ['errored', 'test-job-123'],
    },
    {
      name: 'handles cancelled_due_to_timeout status',
      result: { status: 'cancelled_due_to_timeout' },
      jobId: 'job-456',
      expectedError: true,
      expectedContains: ['timeout', 'job-456'],
    },
    {
      name: 'handles cancelled_due_to_limits status',
      result: { status: 'cancelled_due_to_limits' },
      jobId: 'job-789',
      expectedError: true,
      expectedContains: ['limits', 'job-789'],
    },
    {
      name: 'handles cancelled_by_user status',
      result: { status: 'cancelled_by_user' },
      jobId: 'job-000',
      expectedError: true,
      expectedContains: ['cancelled by user', 'job-000'],
    },
    {
      name: 'handles unknown status',
      result: { status: 'some_unknown_status' },
      jobId: 'job-unknown',
      expectedError: true,
      expectedContains: ['some_unknown_status', 'job-unknown'],
    },
  ];

  it.each(scenarios)('$name', ({ result, jobId, expectedError, expectedContains }) => {
    const response = handleErrorResult(result, jobId);
    expect(response.isError).toBe(expectedError);
    // Each fragment is looked up in the first (and only) content entry.
    for (const fragment of expectedContains) {
      expect(response.content[0].text).toContain(fragment);
    }
  });
});
// Exercises initiateCrawl against a stubbed global fetch: the happy path,
// a non-2xx HTTP reply, and a 200 reply whose body reports success: false.
describe('initiateCrawl', () => {
  beforeEach(() => {
    vi.stubGlobal('fetch', vi.fn());
  });

  // Put the real fetch back so the stub cannot leak into other suites.
  afterEach(() => {
    vi.unstubAllGlobals();
  });

  const testCases = [
    {
      name: 'initiates crawl successfully',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      options: { url: 'https://example.com', formats: ['markdown'] },
      mockResponse: { success: true, result: { id: 'job-123' } },
      expectedJobId: 'job-123',
    },
    {
      name: 'throws on HTTP error',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      options: { url: 'https://example.com' },
      mockResponse: null,
      mockStatus: 401,
      expectedError: 'Failed to initiate crawl: 401',
    },
    {
      name: 'throws on API failure',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      options: { url: 'https://example.com' },
      mockResponse: { success: false, errors: [{ message: 'Invalid URL' }] },
      expectedError: 'Crawl initiation failed',
    },
  ];

  it.each(testCases)('$name', async ({ accountId, apiToken, options, mockResponse, mockStatus, expectedJobId, expectedError }) => {
    const fetchMock = vi.mocked(fetch);
    if (expectedError) {
      if (mockStatus) {
        // HTTP-level failure: empty body with the requested status code.
        fetchMock.mockResolvedValueOnce(new Response('', { status: mockStatus }));
      } else {
        // API-level failure: HTTP 200 whose JSON body reports the error.
        fetchMock.mockResolvedValueOnce(new Response(JSON.stringify(mockResponse), {
          status: 200,
          headers: { 'Content-Type': 'application/json' }
        }));
      }
      await expect(initiateCrawl(accountId, apiToken, options)).rejects.toThrow(expectedError);
    } else {
      fetchMock.mockResolvedValueOnce(new Response(JSON.stringify(mockResponse), {
        status: 200,
        headers: { 'Content-Type': 'application/json' }
      }));
      const result = await initiateCrawl(accountId, apiToken, options);
      expect(result).toBe(expectedJobId);
    }
  });
});
// Checks that waitForCrawl hands back the job result as soon as the stubbed
// API reports any status other than "running".
describe('waitForCrawl', () => {
  beforeEach(() => {
    vi.stubGlobal('fetch', vi.fn());
  });

  // Put the real fetch back so the stub cannot leak into other suites.
  afterEach(() => {
    vi.unstubAllGlobals();
  });

  const testCases: Array<{
    name: string;
    accountId: string;
    apiToken: string;
    jobId: string;
    mockResponse: any;
    expectedStatus: string;
  }> = [
    {
      name: 'returns completed result immediately',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      jobId: 'job-123',
      mockResponse: { result: { status: 'completed', total: 5, records: [] } },
      expectedStatus: 'completed',
    },
    {
      name: 'returns errored result',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      jobId: 'job-123',
      mockResponse: { result: { status: 'errored', error: 'Something went wrong' } },
      expectedStatus: 'errored',
    },
    {
      name: 'returns cancelled_due_to_limits result',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      jobId: 'job-123',
      mockResponse: { result: { status: 'cancelled_due_to_limits' } },
      expectedStatus: 'cancelled_due_to_limits',
    },
  ];

  it.each(testCases)('$name', async ({ accountId, apiToken, jobId, mockResponse, expectedStatus }) => {
    const fetchMock = vi.mocked(fetch);
    fetchMock.mockResolvedValue(new Response(JSON.stringify(mockResponse), {
      status: 200,
      headers: { 'Content-Type': 'application/json' }
    }));
    // One attempt with a 1 ms delay keeps the test fast.
    const result = await waitForCrawl(accountId, apiToken, jobId, 1, 1);
    expect(result.status).toBe(expectedStatus);
  });
});
+449
View File
@@ -0,0 +1,449 @@
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
// Base URL of the Cloudflare v4 REST API; endpoint paths are appended to it.
const API_BASE = "https://api.cloudflare.com/client/v4";
// Default number of attempts fetchWithRetry makes before giving up.
const MAX_RETRIES = 3;
// Spacing (ms) used by enforceRateLimit between consecutive requests;
// 10 s corresponds to a budget of 6 requests per minute.
const RATE_LIMIT_DELAY_MS = 10000;
// Mutable limiter state shared by every enforceRateLimit call:
// time (ms since epoch) at which the most recent request was admitted,
let lastRequestTime = 0;
// number of requests admitted in the current 60-second window,
let requestCount = 0;
// and the time (ms since epoch) at which that window started.
let windowStart = Date.now();
/**
 * Settings for one crawl job, as consumed by initiateCrawl.
 * Only `url` is required; initiateCrawl substitutes defaults for omitted fields.
 */
interface CrawlOptions {
/** The starting URL to crawl. */
url: string;
/** Maximum number of pages to crawl; initiateCrawl sends 10 when omitted. */
limit?: number;
/** Maximum link depth from the starting URL; initiateCrawl sends 1 when omitted. */
depth?: number;
/** Requested output formats; initiateCrawl sends ["markdown"] when omitted. */
formats?: string[];
/** Forwarded as the `render` flag; initiateCrawl sends true when omitted. */
render?: boolean;
/** Forwarded unchanged as `maxAge`. NOTE(review): unit and meaning are not visible here - confirm against the API docs. */
maxAge?: number;
/** Forwarded as `source`; initiateCrawl sends "all" when omitted. NOTE(review): accepted values not visible here. */
source?: string;
/** Link-filtering settings forwarded as the nested `options` object (an empty object when omitted). */
options?: {
includeExternalLinks?: boolean;
includeSubdomains?: boolean;
includePatterns?: string[];
excludePatterns?: string[];
};
}
/**
 * Reads a required environment variable.
 *
 * @param key - Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or is an empty string.
 */
function getEnv(key: string): string {
  const raw = process.env[key];
  if (raw) {
    return raw;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
/**
 * Client-side throttle applied before a request is sent to the Cloudflare API.
 *
 * Two rules are enforced using the module-level limiter state:
 *  1. At most `CF_RATE_LIMIT` requests (default 6) are admitted per 60-second
 *     window; once the budget is used up the call waits for the window to end.
 *  2. Consecutive requests inside a window are spaced evenly. The spacing is
 *     derived from the configured budget (60 s / budget) and never exceeds
 *     RATE_LIMIT_DELAY_MS, so the default budget of 6 still yields 10 s while
 *     a larger budget (e.g. 600) is no longer held back to 6 requests/minute.
 *
 * A missing, non-numeric or non-positive `CF_RATE_LIMIT` falls back to the
 * default budget instead of disabling the limiter.
 *
 * @returns A promise that resolves once the caller may send its request.
 */
async function enforceRateLimit(): Promise<void> {
  const windowDuration = 60000;
  const defaultRequestsPerMinute = 6;

  const configuredLimit = parseInt(process.env.CF_RATE_LIMIT || "", 10);
  const requestsPerMinute =
    Number.isFinite(configuredLimit) && configuredLimit > 0
      ? configuredLimit
      : defaultRequestsPerMinute;
  const minSpacingMs = Math.min(
    RATE_LIMIT_DELAY_MS,
    Math.ceil(windowDuration / requestsPerMinute)
  );

  const sleep = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  let now = Date.now();

  // Start a fresh window once the previous one has elapsed.
  if (now - windowStart >= windowDuration) {
    requestCount = 0;
    windowStart = now;
  }

  // Budget exhausted: wait for the remainder of the window, then reset.
  if (requestCount >= requestsPerMinute) {
    const waitTime = windowDuration - (now - windowStart);
    console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
    await sleep(waitTime);
    requestCount = 0;
    windowStart = Date.now();
    // Refresh the clock so the spacing check below does not use a stale time.
    now = windowStart;
  }

  // Space out requests within the window (the first request of a window is never delayed).
  const timeSinceLastRequest = now - lastRequestTime;
  if (timeSinceLastRequest < minSpacingMs && requestCount > 0) {
    await sleep(minSpacingMs - timeSinceLastRequest);
  }

  lastRequestTime = Date.now();
  requestCount++;
}
/**
 * Runs `fn`, retrying it when it fails with a rate-limit error.
 *
 * An error counts as a rate-limit error when its message contains "429" or
 * "Rate limit". The wait before the next attempt is taken from a
 * "Retry-After: <seconds>" fragment in the message when present, otherwise it
 * grows exponentially (1 s, 2 s, 4 s, ...) up to 30 s. Any other error, or a
 * rate-limit error on the final attempt, is rethrown unchanged.
 *
 * @param fn - Operation to run; called once per attempt.
 * @param retries - Total number of attempts (default MAX_RETRIES). Values
 *   below 1 are treated as a single attempt so `fn` is always tried.
 * @returns Whatever `fn` resolves to.
 * @throws The last error raised by `fn`.
 */
async function fetchWithRetry<T>(
  fn: () => Promise<T>,
  retries: number = MAX_RETRIES
): Promise<T> {
  // Always make at least one attempt; previously retries <= 0 skipped the
  // call entirely and threw `null`.
  const maxAttempts = Math.max(1, Math.floor(retries) || 1);

  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (error: unknown) {
      // Only string messages are inspected; anything else is not retryable.
      const rawMessage = (error as { message?: unknown } | null | undefined)?.message;
      const errorStr = typeof rawMessage === "string" ? rawMessage : "";

      const isRateLimit = errorStr.includes("429") || errorStr.includes("Rate limit");
      const isLastAttempt = attempt >= maxAttempts - 1;
      if (!isRateLimit || isLastAttempt) {
        throw error;
      }

      const retryAfterMatch = errorStr.match(/Retry-After[:\s]*(\d+)/i);
      const delay = retryAfterMatch
        ? parseInt(retryAfterMatch[1], 10) * 1000
        : Math.min(1000 * Math.pow(2, attempt), 30000);
      console.error(`Rate limited. Retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
/**
 * Starts a crawl job through the Cloudflare Browser Rendering crawl endpoint.
 *
 * The call first passes through enforceRateLimit, and the request itself runs
 * inside fetchWithRetry. On a non-OK reply the thrown message includes the
 * status code, the body and, when the header is present, a
 * "Retry-After: <value>" suffix that fetchWithRetry reads to time its retry.
 *
 * Omitted options fall back to: limit 10, depth 1, formats ["markdown"],
 * render true, source "all", and an empty nested options object. `maxAge` is
 * forwarded as-is (and dropped from the JSON body when undefined).
 *
 * @param accountId - Cloudflare account id used in the endpoint path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param options - Crawl settings; only `url` is mandatory.
 * @returns The id of the created crawl job.
 * @throws Error when the HTTP status is not OK, when the body reports
 *   `success: false`, or when a successful body carries no job id.
 */
async function initiateCrawl(
  accountId: string,
  apiToken: string,
  options: CrawlOptions
): Promise<string> {
  await enforceRateLimit();
  return fetchWithRetry(async () => {
    const response = await fetch(
      `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`,
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apiToken}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          url: options.url,
          limit: options.limit ?? 10,
          depth: options.depth ?? 1,
          formats: options.formats ?? ["markdown"],
          render: options.render ?? true,
          maxAge: options.maxAge,
          source: options.source ?? "all",
          options: options.options ?? {},
        }),
      }
    );

    if (!response.ok) {
      const error = await response.text();
      const retryAfter = response.headers.get("Retry-After");
      const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
      throw new Error(errorMsg);
    }

    const data = await response.json();
    if (!data.success) {
      throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
    }

    // Guard against a "successful" body without a job id so callers get a
    // descriptive error instead of a TypeError (or an undefined job id).
    const jobId = data.result?.id;
    if (jobId == null || jobId === "") {
      throw new Error("Crawl initiation failed: response did not include a job id");
    }
    return jobId;
  });
}
/**
 * Polls the status endpoint of a crawl job until it leaves the "running" state.
 *
 * @param accountId - Cloudflare account id used in the endpoint path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param jobId - Id of the crawl job to watch.
 * @param maxAttempts - Maximum number of status requests (default 120).
 * @param delayMs - Pause between two status requests, in milliseconds (default 5000).
 * @returns The `result` object of the first reply whose status is not "running".
 * @throws Error on a non-OK HTTP status, on a reply without a `result`, or
 *   when every attempt still reports "running".
 */
async function waitForCrawl(
  accountId: string,
  apiToken: string,
  jobId: string,
  maxAttempts: number = 120,
  delayMs: number = 5000
): Promise<any> {
  const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const response = await fetch(statusUrl, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
    }

    const data = await response.json();
    const result = data?.result;
    if (!result) {
      // A 200 reply without a result would otherwise surface as an opaque TypeError.
      throw new Error(
        `Failed to check crawl status: response contained no result ${JSON.stringify(data?.errors ?? data)}`
      );
    }

    if (result.status !== "running") {
      return result;
    }

    // Only pause when another attempt follows; sleeping after the last
    // attempt would merely delay the timeout error below.
    if (attempt < maxAttempts - 1) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  throw new Error("Crawl job did not complete within timeout");
}
/**
 * Flat argument bag accepted by buildCrawlOptions, which copies the scalar
 * fields through and groups the four link-filter fields under `options`.
 */
interface CrawlArgs {
/** The starting URL to crawl. */
url: string;
/** Maximum number of pages to crawl. */
limit?: number;
/** Maximum link depth to crawl from the starting URL. */
depth?: number;
/** Copied to `options.includeSubdomains`. */
includeSubdomains?: boolean;
/** Copied to `options.includeExternalLinks`. */
includeExternalLinks?: boolean;
/** Copied to `options.includePatterns`. */
includePatterns?: string[];
/** Copied to `options.excludePatterns`. */
excludePatterns?: string[];
/** Copied to the top-level `render` field. */
render?: boolean;
}
/**
 * Converts flat tool arguments into the nested CrawlOptions shape.
 *
 * @param args - Flat arguments; absent fields stay `undefined` in the output.
 * @param formats - Output formats to request, used as given.
 * @returns Crawl options with the four link-filter fields grouped under `options`.
 */
function buildCrawlOptions(args: CrawlArgs, formats: string[]): CrawlOptions {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;

  // The link-filter fields live in their own nested object.
  const linkFilters = { includeExternalLinks, includeSubdomains, includePatterns, excludePatterns };

  return { url, limit, depth, formats, render, options: linkFilters };
}
/**
 * Renders the completed pages of a crawl result as one Markdown document.
 *
 * Each completed record becomes a section headed by its metadata title (or
 * its URL when no title is present), followed by the URL, the page markdown
 * and a horizontal rule. Records with any other status are left out.
 *
 * @param result - Crawl result; `records` may be missing.
 * @returns A summary line followed by the page sections.
 */
function formatMarkdownResult(result: any): string {
  const isCompleted = (entry: any): boolean => entry.status === "completed";

  const toSection = (entry: any): string => {
    const heading = entry.metadata?.title || entry.url;
    const body = entry.markdown || "";
    // Blank-line separated parts; the rule keeps its trailing newline.
    return [`## ${heading}`, `URL: ${entry.url}`, body, "---\n"].join("\n\n");
  };

  const sections: string[] = (result.records || []).filter(isCompleted).map(toSection);
  const summary = `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.`;
  return `${summary}\n\n${sections.join("\n")}`;
}
/**
 * Renders the completed pages of a crawl result as a sequence of
 * `<article>` elements preceded by a plain-text summary line.
 *
 * The page title and URL come from crawled (untrusted) content, so they are
 * HTML-escaped before being placed into the heading, the link text and the
 * `href` attribute. The page body (`record.html`) is the crawled markup
 * itself and is embedded unchanged.
 *
 * @param result - Crawl result; `records` may be missing.
 * @returns A summary line followed by one article per completed record.
 */
function formatHtmlResult(result: any): string {
  // Escapes the characters that are significant in HTML text and attributes.
  const escapeHtml = (value: unknown): string =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");

  const records = result.records || [];
  const completedRecords = records.filter((r: any) => r.status === "completed");
  const content = completedRecords
    .map((record: any) => {
      const url = escapeHtml(record.url);
      // Fall back to the URL when the page exposes no title.
      const title = escapeHtml(record.metadata?.title || record.url);
      return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${url}">${url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
    })
    .join("\n");
  return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
/**
 * Renders a crawl result as pretty-printed JSON.
 *
 * The output has a `summary` (total pages, number of completed pages, job
 * status) and a `pages` array with one entry per record whose status is
 * "completed".
 *
 * @param result - Crawl job result; reads `records`, `total` and `status`.
 * @returns The JSON document, indented by two spaces.
 */
function formatJsonResult(result: any): string {
  const pages: any[] = [];
  for (const entry of result.records || []) {
    if (entry.status === "completed") {
      pages.push({
        url: entry.url,
        title: entry.metadata?.title,
        status: entry.metadata?.status,
        markdown: entry.markdown,
        html: entry.html,
        json: entry.json,
      });
    }
  }
  const summary = {
    total: result.total,
    completed: pages.length,
    status: result.status,
  };
  return JSON.stringify({ summary, pages }, null, 2);
}
/**
 * Builds the error response for a crawl job that did not succeed.
 *
 * Known failure statuses get a specific message; any other status falls back
 * to a generic message that echoes the status. The job ID is always included.
 *
 * @param result - Crawl job result; only `status` is read.
 * @param jobId - ID of the crawl job, included in the message.
 * @returns A single text content item with `isError` set to true.
 */
function handleErrorResult(result: any, jobId: string): { content: any[]; isError: boolean } {
  const knownFailures: Record<string, string> = {
    errored: `Crawl job errored. Job ID: ${jobId}`,
    cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
    cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
    cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
  };
  const text =
    knownFailures[result.status] || `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
  const textItem = { type: "text" as const, text };
  return { content: [textItem], isError: true };
}
// Server instance for this process. It identifies itself as
// "cloudflare-crawl-mcp" version 1.0.0 and declares only the `tools`
// capability. Request handlers are registered on it with setRequestHandler,
// and main() connects it to a stdio transport.
const server = new Server(
{
name: "cloudflare-crawl-mcp",
version: "1.0.0",
},
{
capabilities: {
tools: {},
},
}
);
// Input schema shared by the crawl_url_markdown, crawl_url_html and
// crawl_url_json tools. Only `url` is required; every other property is
// optional and mirrors a field of CrawlArgs.
const baseToolSchema = {
type: "object" as const,
properties: {
url: {
type: "string" as const,
description: "The starting URL to crawl",
},
limit: {
type: "number" as const,
description: "Maximum number of pages to crawl (default: 10, max: 100000)",
},
depth: {
type: "number" as const,
description: "Maximum link depth to crawl from the starting URL (default: 1)",
},
includeSubdomains: {
type: "boolean" as const,
description: "If true, follows links to subdomains of the starting URL (default: false)",
},
includeExternalLinks: {
type: "boolean" as const,
description: "If true, follows links to external domains (default: false)",
},
includePatterns: {
type: "array" as const,
items: { type: "string" as const },
description: "Only visits URLs that match one of these wildcard patterns",
},
excludePatterns: {
type: "array" as const,
items: { type: "string" as const },
description: "Does not visit URLs that match any of these wildcard patterns",
},
render: {
type: "boolean" as const,
description: "If false, does a fast HTML fetch without executing JavaScript (default: true)",
},
},
// `as string[]` widens the literal tuple so the schema stays assignable where a mutable string array is expected.
required: ["url"] as string[],
};
// Markdown footer appended to every tool description so clients see the
// Browser Rendering plan limits alongside the tool. The tips only mention
// parameters the tools actually accept (`render`, `limit`, `depth`); the
// earlier `maxAge` tip was dropped because no tool exposes that parameter.
const RATE_LIMIT_INFO = `
---
**Cloudflare Browser Rendering Limits:**
| Plan | Concurrent Browsers | Browser Time | REST API Rate |
|------|---------------------|--------------|---------------|
| Free | 3 | 10 min/day | 6 req/min |
| Paid | 10 | 10 hours/month | 600 req/min |
**Environment Variables:**
- CF_RATE_LIMIT: Override REST API requests per minute (default: 6 for Free, 600 for Paid)
**Tips:**
- Use \`render: false\` for static content to avoid browser time usage
- Set \`limit\` and \`depth\` appropriately to stay within limits
---`;
// Advertises the three crawl tools (Markdown, HTML and JSON output). All of
// them take the same input schema; each description ends with the rate-limit
// footer.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  const markdownTool = {
    name: "crawl_url_markdown",
    description:
      `Crawl a website using Cloudflare Browser Rendering and return content in Markdown format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
    // Shallow copy of the shared schema, including its property map.
    inputSchema: { ...baseToolSchema, properties: { ...baseToolSchema.properties } },
  };

  const htmlTool = {
    name: "crawl_url_html",
    description:
      `Crawl a website using Cloudflare Browser Rendering and return content in HTML format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
    inputSchema: baseToolSchema,
  };

  const jsonTool = {
    name: "crawl_url_json",
    description:
      `Crawl a website using Cloudflare Browser Rendering and return content in JSON format. This uses Workers AI for data extraction. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
    inputSchema: baseToolSchema,
  };

  return { tools: [markdownTool, htmlTool, jsonTool] };
});
// Handles calls to crawl_url_markdown / crawl_url_html / crawl_url_json.
// The output format is taken from the tool name. The handler starts a crawl,
// waits for it to finish, and returns the formatted pages as a single text
// item. Every failure (unknown tool, invalid arguments, missing environment
// variables, API errors, failed jobs) is reported as a text item with
// `isError: true` rather than thrown.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: rawArgs } = request.params as {
    name: string;
    arguments?: Record<string, unknown>;
  };
  const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
  if (!toolMatch) {
    return {
      content: [{ type: "text", text: `Unknown tool: ${name}` }],
      isError: true,
    };
  }
  const format = toolMatch[1] as "markdown" | "html" | "json";

  // `arguments` may be absent on a request; treat that as an empty argument
  // object so the problem is reported as a missing `url`, not a TypeError.
  const args = rawArgs ?? {};
  if (typeof args.url !== "string" || args.url.trim() === "") {
    return {
      content: [{ type: "text", text: 'Error: "url" is required and must be a non-empty string' }],
      isError: true,
    };
  }

  try {
    const apiToken = getEnv("CF_API_TOKEN");
    const accountId = getEnv("CF_ACCOUNT_ID");
    const crawlArgs: CrawlArgs = {
      url: args.url,
      limit: args.limit as number | undefined,
      depth: args.depth as number | undefined,
      includeSubdomains: args.includeSubdomains as boolean | undefined,
      includeExternalLinks: args.includeExternalLinks as boolean | undefined,
      includePatterns: args.includePatterns as string[] | undefined,
      excludePatterns: args.excludePatterns as string[] | undefined,
      render: args.render as boolean | undefined,
    };
    // Each tool requests exactly the one format encoded in its name.
    const options = buildCrawlOptions(crawlArgs, [format]);
    const jobId = await initiateCrawl(accountId, apiToken, options);
    const result = await waitForCrawl(accountId, apiToken, jobId);

    const failureStatuses = ["errored", "cancelled_due_to_timeout", "cancelled_due_to_limits", "cancelled_by_user"];
    if (failureStatuses.includes(result.status)) {
      return handleErrorResult(result, jobId);
    }

    const formatters: Record<"markdown" | "html" | "json", (result: any) => string> = {
      markdown: formatMarkdownResult,
      html: formatHtmlResult,
      json: formatJsonResult,
    };
    return {
      content: [{ type: "text", text: formatters[format](result) }],
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      content: [{ type: "text", text: `Error: ${message}` }],
      isError: true,
    };
  }
});
/**
 * Entry point: connects the server to a newly created stdio transport and
 * resolves once the connection call completes.
 */
async function main() {
  await server.connect(new StdioServerTransport());
}
// Start the server. If startup rejects, log the failure and exit with a
// non-zero status code.
main().catch((startupError) => {
  console.error("Server error:", startupError);
  process.exit(1);
});
+193
View File
@@ -0,0 +1,193 @@
import { describe, it, expect, beforeAll } from 'vitest';
// Base URL of the Cloudflare v4 API; account-scoped request paths are appended to it.
const API_BASE = "https://api.cloudflare.com/client/v4";
/**
 * Options for starting a crawl job, as consumed by initiateCrawl. Fields left
 * undefined are replaced by initiateCrawl's defaults when the request body is
 * built.
 */
interface CrawlOptions {
/** Starting URL to crawl. */
url: string;
/** Page limit; initiateCrawl sends 10 when omitted. */
limit?: number;
/** Link depth; initiateCrawl sends 1 when omitted. */
depth?: number;
/** Output formats; initiateCrawl sends ["markdown"] when omitted. */
formats?: string[];
/** Render flag; initiateCrawl sends true when omitted. */
render?: boolean;
/** Link-following and URL-pattern settings; initiateCrawl sends {} when omitted. */
options?: {
includeExternalLinks?: boolean;
includeSubdomains?: boolean;
includePatterns?: string[];
excludePatterns?: string[];
};
}
/**
 * Starts a crawl job and returns its job ID.
 *
 * Unset options are replaced by defaults before the request is sent:
 * `limit` 10, `depth` 1, `formats` ["markdown"], `render` true, `options` {}.
 *
 * @param accountId - Cloudflare account ID used in the request path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param options - Crawl settings; only `url` is mandatory.
 * @returns The ID of the created crawl job.
 * @throws Error if the HTTP request fails, the API reports `success: false`,
 *   or the response carries no usable job ID.
 */
async function initiateCrawl(
  accountId: string,
  apiToken: string,
  options: CrawlOptions
): Promise<string> {
  const response = await fetch(
    `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`,
    {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiToken}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: options.url,
        limit: options.limit ?? 10,
        depth: options.depth ?? 1,
        formats: options.formats ?? ["markdown"],
        render: options.render ?? true,
        options: options.options ?? {},
      }),
    }
  );
  if (!response.ok) {
    const error = await response.text();
    throw new Error(`Failed to initiate crawl: ${response.status} ${error}`);
  }
  const data: any = await response.json();
  if (!data?.success) {
    throw new Error(`Crawl initiation failed: ${JSON.stringify(data?.errors)}`);
  }
  // NOTE(review): the job ID is read from `result.id`, but a bare string
  // `result` is accepted as well in case the API returns the ID directly -
  // confirm the response shape against the API reference.
  const jobId = typeof data.result === "string" ? data.result : data.result?.id;
  if (typeof jobId !== "string" || jobId === "") {
    // Fail here instead of handing `undefined` to the polling code.
    throw new Error(`Crawl initiation returned no job id: ${JSON.stringify(data.result)}`);
  }
  return jobId;
}
/**
 * Polls a crawl job until it leaves the "running" state and returns its result.
 *
 * Status polls request a single record (`limit=1`) so each poll stays small.
 * When the job reports "completed", the job is fetched once more without that
 * cap so the returned `records` are not truncated to a single entry.
 *
 * @param accountId - Cloudflare account ID used in the request path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param jobId - ID of the crawl job to poll.
 * @param maxAttempts - Maximum number of status polls before giving up.
 * @param delayMs - Pause between two polls, in milliseconds.
 * @returns The `result` object of the job's final API response.
 * @throws Error if a request fails, a response carries no `result`, or the job
 *   is still running after `maxAttempts` polls.
 */
async function waitForCrawl(
  accountId: string,
  apiToken: string,
  jobId: string,
  maxAttempts: number = 60,
  delayMs: number = 5000
): Promise<any> {
  const jobUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}`;

  // Fetches the job resource with the given query string and returns its `result`.
  const fetchJob = async (query: string): Promise<any> => {
    const response = await fetch(`${jobUrl}${query}`, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
    }
    const data: any = await response.json();
    if (data?.result == null) {
      throw new Error(`Failed to check crawl status: unexpected response ${JSON.stringify(data)}`);
    }
    return data.result;
  };

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const polled = await fetchJob("?limit=1");
    if (polled.status !== "running") {
      // The poll was capped at one record, so reload the finished job to get
      // all of them. Failed/cancelled jobs are returned as polled.
      // NOTE(review): very large results may be paginated by the API; only the
      // first page is fetched here - confirm against the API reference.
      return polled.status === "completed" ? await fetchJob("") : polled;
    }
    // No point in waiting after the final poll.
    if (attempt < maxAttempts - 1) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
  throw new Error("Crawl job did not complete within timeout");
}
/**
 * Reads a required environment variable.
 *
 * @param key - Name of the environment variable.
 * @returns The variable's value.
 * @throws Error if the variable is unset or empty.
 */
function getEnv(key: string): string {
  const value = process.env[key];
  if (value) {
    return value;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
// Live integration test against the crawl API. It is skipped unless both
// CF_API_TOKEN and CF_ACCOUNT_ID are set, and it returns early (without
// failing) when an error message mentions a rate limit.
describe('Integration: Cloudflare Crawl API', () => {
  const hasCredentials = Boolean(process.env.CF_API_TOKEN && process.env.CF_ACCOUNT_ID);

  beforeAll(() => {
    if (!hasCredentials) {
      console.log('\n⚠️ Skipping integration tests - CF_API_TOKEN or CF_ACCOUNT_ID not set\n');
    }
  });

  it.skipIf(!hasCredentials)('should crawl raczylo.com with multiple pages in markdown format', async () => {
    const accountId = getEnv("CF_ACCOUNT_ID");
    const apiToken = getEnv("CF_API_TOKEN");
    try {
      const jobId = await initiateCrawl(accountId, apiToken, {
        url: "https://raczylo.com",
        limit: 5,
        depth: 2,
        formats: ["markdown"],
      });
      console.log(` Started crawl job: ${jobId}`);
      expect(jobId).toBeDefined();
      expect(typeof jobId).toBe("string");

      // 60 polls x 5 s = 300 s at most, which stays below the 360 s test timeout.
      const result = await waitForCrawl(accountId, apiToken, jobId, 60, 5000);
      console.log(` Crawl status: ${result.status}`);
      console.log(` Total pages discovered: ${result.total}`);
      console.log(` Pages finished: ${result.finished}`);
      expect(result.status).toBe("completed");
      expect(result.total).toBeGreaterThan(0);
      expect(result.records).toBeDefined();
      expect(Array.isArray(result.records)).toBe(true);
      expect(result.records.length).toBeGreaterThan(0);

      const completedRecords = result.records.filter((r: any) => r.status === "completed");
      console.log(` Completed pages: ${completedRecords.length}`);
      expect(completedRecords.length).toBeGreaterThan(0);
      completedRecords.forEach((record: any, index: number) => {
        expect(record.url).toBeDefined();
        expect(record.markdown).toBeDefined();
        expect(record.markdown.length).toBeGreaterThan(0);
        console.log(` Page ${index + 1}: ${record.url} (${record.markdown.length} chars)`);
      });

      // Check the first *completed* page: records[0] is not guaranteed to be
      // a completed record, so it may have no markdown at all.
      expect(completedRecords[0].markdown).toContain("#");
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      if (message.includes("Rate limit")) {
        console.log(" ⚠️ Skipped - Rate limit exceeded");
        return;
      }
      throw error;
    }
  }, 360000);
});
// Unit tests for getEnv. Each test changes process.env only inside
// withRestoredEnv, so real credentials present in the environment are put back
// afterwards instead of being deleted for the rest of the run.
describe('Environment Variable Validation', () => {
  // Runs `body`, then restores `key` to the value it had beforehand (or unsets it).
  const withRestoredEnv = (key: string, body: () => void): void => {
    const previous = process.env[key];
    try {
      body();
    } finally {
      if (previous === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = previous;
      }
    }
  };

  const testCases = [
    {
      name: 'CF_API_TOKEN is required',
      envKey: 'CF_API_TOKEN',
      expectedError: 'Missing required environment variable: CF_API_TOKEN',
    },
    {
      name: 'CF_ACCOUNT_ID is required',
      envKey: 'CF_ACCOUNT_ID',
      expectedError: 'Missing required environment variable: CF_ACCOUNT_ID',
    },
  ];

  it.each(testCases)('$name', ({ envKey, expectedError }) => {
    withRestoredEnv(envKey, () => {
      delete process.env[envKey];
      expect(() => getEnv(envKey)).toThrow(expectedError);
    });
  });

  it('should return value when CF_API_TOKEN is set', () => {
    withRestoredEnv('CF_API_TOKEN', () => {
      process.env.CF_API_TOKEN = 'test-token';
      expect(getEnv('CF_API_TOKEN')).toBe('test-token');
    });
  });

  it('should return value when CF_ACCOUNT_ID is set', () => {
    withRestoredEnv('CF_ACCOUNT_ID', () => {
      process.env.CF_ACCOUNT_ID = 'test-account';
      expect(getEnv('CF_ACCOUNT_ID')).toBe('test-account');
    });
  });
});
+14
View File
@@ -0,0 +1,14 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"moduleResolution": "bundler",
"esModuleInterop": true,
"strict": true,
"outDir": "dist",
"rootDir": "src",
"declaration": true,
"skipLibCheck": true
},
"include": ["src/**/*"]
}
+8
View File
@@ -0,0 +1,8 @@
import { defineConfig } from 'vitest/config';
// Vitest configuration: collect only *.test.ts files under src/ and exclude
// anything under dist/.
export default defineConfig({
test: {
include: ['src/**/*.test.ts'],
exclude: ['dist/**'],
},
});