mirror of
https://github.com/lukaszraczylo/cloudflare-crawl-mcp.git
synced 2026-06-11 23:29:21 +00:00
Initial commit.
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
# Cloudflare API Token (get from https://dash.cloudflare.com/profile/api-tokens)
|
||||
# Required permissions: Account > Browser Rendering > Edit
|
||||
CF_API_TOKEN=your_cloudflare_api_token
|
||||
|
||||
# Cloudflare Account ID (get from https://dash.cloudflare.com/_/account)
|
||||
CF_ACCOUNT_ID=your_cloudflare_account_id
|
||||
|
||||
# Rate limit: REST API requests per minute (default: 6, the Free plan limit; set to 600 on a Paid plan)
|
||||
# CF_RATE_LIMIT=6
|
||||
@@ -0,0 +1,43 @@
|
||||
name: Test and Release
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
cache: 'npm'
|
||||
- run: npm ci
|
||||
- run: npm run build
|
||||
- run: npm test
|
||||
|
||||
release:
|
||||
needs: test
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'release'
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
registry-url: 'https://registry.npmjs.org'
|
||||
- run: npm ci
|
||||
- run: npm run build
|
||||
- name: Run GoReleaser
|
||||
uses: goreleaser/goreleaser-action@v6
|
||||
with:
|
||||
version: latest
|
||||
args: release --clean
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
|
||||
@@ -0,0 +1,2 @@
|
||||
node_modules
|
||||
.envrc
|
||||
@@ -0,0 +1,51 @@
|
||||
# Goreleaser config for @lukaszraczylo/cloudflare-crawl-mcp
|
||||
# Publishes to npm on release
|
||||
|
||||
project_name: cloudflare-crawl-mcp
|
||||
|
||||
before:
|
||||
hooks:
|
||||
- npm ci
|
||||
- npm run build
|
||||
|
||||
builds:
|
||||
- id: cloudflare-crawl-mcp
|
||||
dir: .
|
||||
env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
- linux
|
||||
- darwin
|
||||
- windows
|
||||
goarch:
|
||||
- amd64
|
||||
- arm64
|
||||
|
||||
archives:
|
||||
- id: default
|
||||
format: tarball
|
||||
|
||||
snapshot:
|
||||
name_template: "{{ .Tag }}-next"
|
||||
|
||||
changelog:
|
||||
sort: asc
|
||||
filters:
|
||||
exclude:
|
||||
- "^docs:"
|
||||
- "^test:"
|
||||
- "^chore:"
|
||||
|
||||
release:
|
||||
github:
|
||||
owner: lukaszraczylo
|
||||
name: cloudflare-crawl-mcp
|
||||
draft: false
|
||||
|
||||
npm:
|
||||
name: "@lukaszraczylo/cloudflare-crawl-mcp"
|
||||
channel: "latest"
|
||||
dir: "."
|
||||
skip_upload: false
|
||||
scripts:
|
||||
post_pack: npm run build
|
||||
+30
@@ -0,0 +1,30 @@
|
||||
# Dependencies
|
||||
node_modules/
|
||||
|
||||
# Build output
|
||||
dist/
|
||||
release/
|
||||
|
||||
# Test coverage
|
||||
coverage/
|
||||
|
||||
# Development
|
||||
.vscode/
|
||||
.idea/
|
||||
*.log
|
||||
|
||||
# Git
|
||||
.git/
|
||||
.github/
|
||||
|
||||
# Config
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
|
||||
# Misc
|
||||
*.ts
|
||||
!*.d.ts
|
||||
tsconfig.json
|
||||
vitest.config.ts
|
||||
.goreleaser.yaml
|
||||
@@ -0,0 +1,2 @@
|
||||
github: lukaszraczylo
|
||||
custom: https://github.com/sponsors/lukaszraczylo
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Lukasz Raczylo
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,297 @@
|
||||
# @lukaszraczylo/cloudflare-crawl-mcp
|
||||
|
||||
<p align="center">
|
||||
<a href="https://www.npmjs.com/package/@lukaszraczylo/cloudflare-crawl-mcp">
|
||||
<img src="https://img.shields.io/npm/v/@lukaszraczylo/cloudflare-crawl-mcp" alt="NPM Version">
|
||||
</a>
|
||||
<a href="https://github.com/lukaszraczylo/cloudflare-crawl-mcp/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/license-MIT-blue.svg" alt="License">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
MCP server for crawling websites using Cloudflare Browser Rendering API. Supports multiple output formats including Markdown, HTML, and JSON.
|
||||
|
||||
## Features
|
||||
|
||||
- **Multiple Output Formats**: Choose between Markdown, HTML, or JSON output
|
||||
- **Configurable Crawling**: Control depth, page limits, and link following
|
||||
- **Pattern Filtering**: Include/exclude URLs using wildcard patterns
|
||||
- **JavaScript Rendering**: Execute JavaScript for dynamic content (or disable for static content)
|
||||
- **Environment-Based Secrets**: Securely manage credentials via environment variables
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js 18+
|
||||
- Cloudflare account with Browser Rendering API access
|
||||
- Cloudflare API Token with `Browser Rendering` permissions
|
||||
- Cloudflare Account ID
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Install dependencies and build (run inside the cloned repository)
|
||||
npm install
|
||||
npm run build
|
||||
|
||||
# Run with environment variables
|
||||
CF_API_TOKEN=your_token CF_ACCOUNT_ID=your_account_id npm start
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
### 1. Clone the Repository
|
||||
|
||||
```bash
|
||||
git clone https://github.com/lukaszraczylo/cloudflare-crawl-mcp.git
|
||||
cd cloudflare-crawl-mcp
|
||||
```
|
||||
|
||||
### 2. Install Dependencies
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
### 3. Build the Server
|
||||
|
||||
```bash
|
||||
npm run build
|
||||
```
|
||||
|
||||
### 4. Configure Environment Variables
|
||||
|
||||
Copy the example environment file and add your credentials:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
Edit `.env` with your Cloudflare credentials:
|
||||
|
||||
```
|
||||
CF_API_TOKEN=your_cloudflare_api_token
|
||||
CF_ACCOUNT_ID=your_cloudflare_account_id
|
||||
```
|
||||
|
||||
#### Getting Cloudflare Credentials
|
||||
|
||||
1. **Account ID**: Find it at https://dash.cloudflare.com/_/account
|
||||
2. **API Token**: Create one at https://dash.cloudflare.com/profile/api-tokens with these permissions:
|
||||
- `Account` > `Browser Rendering` > `Edit`
|
||||
|
||||
## MCP Configuration
|
||||
|
||||
### Claude Desktop (macOS)
|
||||
|
||||
Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"cloudflare-crawl": {
|
||||
"command": "npm",
|
||||
"args": ["start"],
|
||||
"env": {
|
||||
"CF_API_TOKEN": "your_api_token",
|
||||
"CF_ACCOUNT_ID": "your_account_id"
|
||||
},
|
||||
"path": "/path/to/cloudflare-crawl-mcp"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Claude Code (CLI)
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"cloudflare-crawl": {
|
||||
"command": "npm",
|
||||
"args": ["start"],
|
||||
"env": {
|
||||
"CF_API_TOKEN": "your_api_token",
|
||||
"CF_ACCOUNT_ID": "your_account_id"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Cursor
|
||||
|
||||
Add to `~/.cursor/mcp.json` (Cursor's global MCP configuration):
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"cloudflare-crawl": {
|
||||
"command": "npm",
|
||||
"args": ["start"],
|
||||
"env": {
|
||||
"CF_API_TOKEN": "your_api_token",
|
||||
"CF_ACCOUNT_ID": "your_account_id"
|
||||
},
|
||||
"path": "/path/to/cloudflare-crawl-mcp"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Available Tools
|
||||
|
||||
### crawl_url_markdown
|
||||
|
||||
Crawl a website and return content in **Markdown** format.
|
||||
|
||||
```typescript
|
||||
{
|
||||
"name": "crawl_url_markdown",
|
||||
"arguments": {
|
||||
"url": "https://example.com/docs",
|
||||
"limit": 50,
|
||||
"depth": 2,
|
||||
"includePatterns": ["https://example.com/docs/**"],
|
||||
"excludePatterns": ["https://example.com/docs/archive/**"],
|
||||
"render": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### crawl_url_html
|
||||
|
||||
Crawl a website and return content in **HTML** format.
|
||||
|
||||
```typescript
|
||||
{
|
||||
"name": "crawl_url_html",
|
||||
"arguments": {
|
||||
"url": "https://example.com",
|
||||
"limit": 10
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### crawl_url_json
|
||||
|
||||
Crawl a website and return content in **JSON** format (uses Workers AI for data extraction).
|
||||
|
||||
```typescript
|
||||
{
|
||||
"name": "crawl_url_json",
|
||||
"arguments": {
|
||||
"url": "https://example.com/products",
|
||||
"limit": 20
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `url` | string | required | Starting URL to crawl |
|
||||
| `limit` | number | 10 | Maximum pages to crawl (max: 100,000) |
|
||||
| `depth` | number | 1 | Maximum link depth from starting URL |
|
||||
| `includeSubdomains` | boolean | false | Follow links to subdomains |
|
||||
| `includeExternalLinks` | boolean | false | Follow links to external domains |
|
||||
| `includePatterns` | string[] | [] | Wildcard patterns to include |
|
||||
| `excludePatterns` | string[] | [] | Wildcard patterns to exclude |
|
||||
| `render` | boolean | true | Execute JavaScript (false = faster static fetch) |
|
||||
|
||||
### Pattern Syntax
|
||||
|
||||
- `*` - Matches any characters except `/`
|
||||
- `**` - Matches any characters including `/`
|
||||
|
||||
Examples:
|
||||
- `https://example.com/docs/**` - All URLs under /docs
|
||||
- `https://example.com/*.html` - All HTML files directly in root
|
||||
|
||||
## Development
|
||||
|
||||
### Commands
|
||||
|
||||
```bash
|
||||
npm run build # Build TypeScript
|
||||
npm start # Run server
|
||||
npm test # Run tests
|
||||
npm run test:watch # Run tests in watch mode
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
The project includes comprehensive tests covering:
|
||||
|
||||
- Environment variable handling
|
||||
- Crawl options building
|
||||
- Result formatting (Markdown, HTML, JSON)
|
||||
- Error handling
|
||||
- API integration
|
||||
|
||||
Run tests:
|
||||
```bash
|
||||
npm test
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
src/
|
||||
├── index.ts # Main MCP server implementation
|
||||
│
|
||||
├── API Layer
|
||||
│ ├── initiateCrawl() # POST to /crawl endpoint
|
||||
│ ├── waitForCrawl() # Poll for job completion
|
||||
│ └── getCrawlResults() # Fetch final results
|
||||
│
|
||||
├── Formatters
|
||||
│ ├── formatMarkdownResult()
|
||||
│ ├── formatHtmlResult()
|
||||
│ └── formatJsonResult()
|
||||
│
|
||||
└── MCP Handlers
|
||||
├── ListToolsRequestSchema # Tool registration
|
||||
└── CallToolRequestSchema # Tool execution
|
||||
```
|
||||
|
||||
## Cloudflare Limits
|
||||
|
||||
- **Max crawl duration**: 7 days
|
||||
- **Results available**: 14 days after completion
|
||||
- **Max pages per job**: 100,000
|
||||
- **Free plan**: 10 minutes of browser time per day
|
||||
|
||||
See [Cloudflare Browser Rendering Limits](https://developers.cloudflare.com/browser-rendering/limits/) for details.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Crawl returns no results
|
||||
|
||||
- Check `robots.txt` blocking (use `render: false` to bypass)
|
||||
- Verify `includePatterns` match actual URLs
|
||||
- Try increasing `depth` or disabling pattern filters
|
||||
|
||||
### Job cancelled due to limits
|
||||
|
||||
- Upgrade to Workers Paid plan
|
||||
- Use `render: false` for static content
|
||||
- Reduce `limit` parameter
|
||||
|
||||
### Authentication errors
|
||||
|
||||
- Verify API Token has Browser Rendering permissions
|
||||
- Confirm Account ID is correct
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see [LICENSE](LICENSE) file.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! Please read the contributing guidelines in the repository (https://github.com/lukaszraczylo/cloudflare-crawl-mcp) before submitting a pull request.
|
||||
|
||||
## Support
|
||||
|
||||
- Open an issue at https://github.com/lukaszraczylo/cloudflare-crawl-mcp/issues
|
||||
- Check Cloudflare's [Browser Rendering Docs](https://developers.cloudflare.com/browser-rendering/) for API details
|
||||
Vendored
+1
@@ -0,0 +1 @@
|
||||
export {};
|
||||
Vendored
+339
@@ -0,0 +1,339 @@
|
||||
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
||||
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
||||
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
|
||||
// Root of the Cloudflare v4 REST API; account-scoped paths are appended to it.
const API_BASE = "https://api.cloudflare.com/client/v4";
|
||||
// Default number of attempts fetchWithRetry() makes before giving up.
const MAX_RETRIES = 3;
|
||||
// Minimum spacing (ms) enforceRateLimit() keeps between consecutive requests.
const RATE_LIMIT_DELAY_MS = 10000;
|
||||
// --- Mutable limiter state shared by every enforceRateLimit() call ---
// Timestamp (ms since epoch) at which the previous request was let through.
let lastRequestTime = 0;
|
||||
// Requests let through in the current 60-second window.
let requestCount = 0;
|
||||
// Timestamp (ms since epoch) at which the current window started.
let windowStart = Date.now();
|
||||
/**
 * Reads a mandatory environment variable.
 *
 * @param key Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or set to an empty string.
 */
function getEnv(key) {
    const raw = process.env[key];
    // An empty string is treated exactly like a missing variable.
    if (raw) {
        return raw;
    }
    throw new Error(`Missing required environment variable: ${key}`);
}
|
||||
/**
 * Client-side throttle for Cloudflare REST calls.
 *
 * Combines two mechanisms, both driven by the module-level limiter state
 * (`windowStart`, `requestCount`, `lastRequestTime`):
 *   1. a fixed 60-second window that allows at most CF_RATE_LIMIT requests
 *      (default 6), sleeping until the window ends once the budget is spent;
 *   2. a minimum spacing between consecutive requests inside a window.
 *
 * The spacing is derived from the configured limit and capped at
 * RATE_LIMIT_DELAY_MS. With the default limit of 6 it is still 10 s, but a
 * higher CF_RATE_LIMIT (e.g. 600) is no longer throttled to 6 requests/minute
 * by a hard-coded 10 s gap.
 *
 * An unparsable or non-positive CF_RATE_LIMIT falls back to 6 instead of
 * silently disabling the window check (a NaN comparison is always false).
 *
 * Progress messages go to stderr via console.error.
 *
 * @returns Resolves once the caller may issue its request.
 */
async function enforceRateLimit() {
    const windowDuration = 60000;
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    const parsedLimit = parseInt(process.env.CF_RATE_LIMIT || "6", 10);
    const requestsPerMinute = Number.isFinite(parsedLimit) && parsedLimit > 0 ? parsedLimit : 6;
    // Spread the per-minute budget evenly, never waiting longer than the legacy fixed delay.
    const minSpacingMs = Math.min(RATE_LIMIT_DELAY_MS, Math.ceil(windowDuration / requestsPerMinute));
    let now = Date.now();
    if (now - windowStart >= windowDuration) {
        // The previous window has expired: start a fresh one.
        requestCount = 0;
        windowStart = now;
    }
    if (requestCount >= requestsPerMinute) {
        const waitTime = windowDuration - (now - windowStart);
        console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
        await sleep(waitTime);
        requestCount = 0;
        windowStart = Date.now();
        // Refresh the clock so later arithmetic does not use the pre-sleep time.
        now = windowStart;
    }
    const timeSinceLastRequest = now - lastRequestTime;
    // The first request of a window is never delayed by the spacing rule.
    if (requestCount > 0 && timeSinceLastRequest < minSpacingMs) {
        await sleep(minSpacingMs - timeSinceLastRequest);
    }
    lastRequestTime = Date.now();
    requestCount++;
}
|
||||
/**
 * Runs `fn`, retrying only when the failure looks like a rate-limit error.
 *
 * A failure counts as a rate limit when its message contains "429" or
 * "Rate limit". If the message carries a "Retry-After: <seconds>" fragment,
 * that delay is honoured; otherwise the wait is exponential (1 s, 2 s, 4 s …
 * capped at 30 s). Any other failure is rethrown immediately.
 *
 * Fixes over the previous version:
 *   - thrown values that are not Error objects (strings, null, undefined) are
 *     inspected via their string form instead of crashing on `.message`;
 *   - `retries` below 1 (or not a number) still performs one attempt rather
 *     than skipping `fn` and throwing `null`.
 *
 * @param fn Async operation to attempt.
 * @param retries Maximum number of attempts (default MAX_RETRIES).
 * @returns Whatever `fn` resolves to.
 * @throws The last error raised by `fn`.
 */
async function fetchWithRetry(fn, retries = MAX_RETRIES) {
    const maxAttempts = Math.max(1, Math.floor(Number(retries)) || 1);
    let lastError = null;
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
        try {
            return await fn();
        }
        catch (error) {
            lastError = error;
            const errorStr = typeof error?.message === "string"
                ? error.message
                : String(error ?? "");
            const isRateLimit = errorStr.includes("429") ||
                errorStr.includes("Rate limit");
            // Give up on non-rate-limit failures and on the final attempt.
            if (!isRateLimit || attempt === maxAttempts - 1) {
                throw error;
            }
            const retryAfterMatch = errorStr.match(/Retry-After[:\s]*(\d+)/i);
            const delay = retryAfterMatch
                ? parseInt(retryAfterMatch[1], 10) * 1000
                : Math.min(1000 * Math.pow(2, attempt), 30000);
            console.error(`Rate limited. Retrying in ${delay}ms...`);
            await new Promise((resolve) => setTimeout(resolve, delay));
        }
    }
    // Unreachable in practice (the loop returns or throws); kept as a safety net.
    throw lastError;
}
|
||||
/**
 * Starts a crawl job through the Browser Rendering `/crawl` endpoint.
 *
 * Waits for the client-side rate limiter once, then POSTs the job definition,
 * retrying via fetchWithRetry() when Cloudflare answers with a rate-limit
 * error. On an HTTP failure the `Retry-After` header, if present, is appended
 * to the error message so fetchWithRetry() can honour it.
 *
 * The job ID is accepted either as a bare string `result` or as `result.id`.
 * NOTE(review): the exact response shape should be confirmed against the
 * Cloudflare API reference; the previous code only handled `result.id` and
 * returned `undefined` for a string result.
 *
 * @param accountId Cloudflare account ID.
 * @param apiToken API token sent as a Bearer credential.
 * @param options Crawl options; unset fields fall back to limit 10, depth 1,
 *   formats ["markdown"], render true, source "all", options {}.
 * @returns The ID of the newly created crawl job.
 * @throws Error on a non-2xx response, an unsuccessful API envelope, or a
 *   response that carries no job ID.
 */
async function initiateCrawl(accountId, apiToken, options) {
    await enforceRateLimit();
    return fetchWithRetry(async () => {
        const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl`, {
            method: "POST",
            headers: {
                Authorization: `Bearer ${apiToken}`,
                "Content-Type": "application/json",
            },
            body: JSON.stringify({
                url: options.url,
                limit: options.limit ?? 10,
                depth: options.depth ?? 1,
                formats: options.formats ?? ["markdown"],
                render: options.render ?? true,
                // An undefined maxAge is dropped by JSON.stringify.
                maxAge: options.maxAge,
                source: options.source ?? "all",
                options: options.options ?? {},
            }),
        });
        if (!response.ok) {
            const error = await response.text();
            const retryAfter = response.headers.get("Retry-After");
            const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
            throw new Error(errorMsg);
        }
        const data = await response.json();
        if (!data?.success) {
            throw new Error(`Crawl initiation failed: ${JSON.stringify(data?.errors)}`);
        }
        const jobId = typeof data.result === "string" ? data.result : data.result?.id;
        if (!jobId) {
            // Fail here rather than polling ".../crawl/undefined" later.
            throw new Error(`Crawl initiation returned no job ID: ${JSON.stringify(data.result)}`);
        }
        return jobId;
    });
}
|
||||
/**
 * Polls a crawl job until it leaves the "running" state and returns its result.
 *
 * Polling uses `?limit=1` to keep status checks small. The previous version
 * returned that polling payload directly, so callers only ever saw the records
 * included in a limit=1 response. When the job reports "completed", the job is
 * now fetched once more without the limit so the returned result carries the
 * crawled records. Other non-running statuses are returned as polled.
 *
 * NOTE(review): a single unrestricted request is issued; if the API paginates
 * large result sets, follow-up pages are not fetched here — confirm against
 * the Cloudflare crawl endpoint documentation.
 *
 * @param accountId Cloudflare account ID.
 * @param apiToken API token sent as a Bearer credential.
 * @param jobId ID of the crawl job to watch.
 * @param maxAttempts Maximum number of status polls (default 120).
 * @param delayMs Pause between polls in milliseconds (default 5000).
 * @returns The job's `result` object.
 * @throws Error on a non-2xx response, a response without `result`, or when
 *   the job is still running after `maxAttempts` polls.
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
    const jobUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}`;
    // Fetches the job with the given query string and unwraps `result`.
    const fetchJob = async (query) => {
        const response = await fetch(`${jobUrl}${query}`, {
            headers: {
                Authorization: `Bearer ${apiToken}`,
            },
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
        }
        const data = await response.json();
        if (!data || !data.result) {
            throw new Error(`Unexpected crawl status response: ${JSON.stringify(data)}`);
        }
        return data.result;
    };
    for (let i = 0; i < maxAttempts; i++) {
        const polled = await fetchJob("?limit=1");
        if (polled.status !== "running") {
            // Only a completed job has records worth re-fetching in full.
            return polled.status === "completed" ? fetchJob("") : polled;
        }
        await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
    throw new Error("Crawl job did not complete within timeout");
}
|
||||
/**
 * Maps flat tool arguments onto the nested shape consumed by initiateCrawl().
 *
 * Link-following settings are grouped under `options`; everything else stays
 * at the top level. Missing arguments are passed through as `undefined`.
 *
 * @param args Tool arguments (url, limit, depth, render and link settings).
 * @param formats Output formats to request.
 * @returns The crawl options object.
 */
function buildCrawlOptions(args, formats) {
    const { url, limit, depth, render } = args;
    const linkSettings = {
        includeExternalLinks: args.includeExternalLinks,
        includeSubdomains: args.includeSubdomains,
        includePatterns: args.includePatterns,
        excludePatterns: args.excludePatterns,
    };
    return { url, limit, depth, formats, render, options: linkSettings };
}
|
||||
/**
 * Renders a crawl result as one Markdown document.
 *
 * Only records whose status is "completed" are included. Each becomes a
 * section headed by the page title (falling back to the URL), followed by the
 * URL, the page's Markdown and a horizontal rule. A summary line comes first.
 *
 * @param result Crawl result with `total` and an optional `records` array.
 * @returns The summary line followed by the concatenated sections.
 */
function formatMarkdownResult(result) {
    const sections = [];
    for (const page of result.records || []) {
        if (page.status !== "completed") {
            continue;
        }
        const heading = page.metadata?.title || page.url;
        sections.push(`## ${heading}\n\nURL: ${page.url}\n\n${page.markdown || ""}\n\n---\n`);
    }
    const body = sections.join("\n");
    return `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.\n\n${body}`;
}
|
||||
/**
 * Escapes the characters that are significant in HTML text and attribute
 * values so crawled strings cannot inject markup.
 */
function escapeHtml(value) {
    const replacements = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#39;",
    };
    return String(value).replace(/[&<>"']/g, (ch) => replacements[ch]);
}
/**
 * Renders a crawl result as a sequence of HTML `<article>` elements.
 *
 * Only records whose status is "completed" are included. The page title
 * (falling back to the URL) and the URL come from crawled pages and are
 * HTML-escaped before being placed in the heading, link text and `href`
 * attribute. The page's own `html` payload is inserted unchanged because it is
 * the content being returned. A plain-text summary line comes first.
 *
 * @param result Crawl result with `total` and an optional `records` array.
 * @returns The summary line followed by the concatenated articles.
 */
function formatHtmlResult(result) {
    const records = result.records || [];
    const completedRecords = records.filter((r) => r.status === "completed");
    const content = completedRecords
        .map((record) => {
            const title = escapeHtml(record.metadata?.title || record.url);
            const url = escapeHtml(record.url);
            return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${url}">${url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
        })
        .join("\n");
    return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
|
||||
/**
 * Serialises a crawl result as pretty-printed JSON (two-space indent).
 *
 * The output has a `summary` (total, number of completed records, job status)
 * and a `pages` array built from the records whose status is "completed".
 *
 * @param result Crawl result with `total`, `status` and optional `records`.
 * @returns The JSON text.
 */
function formatJsonResult(result) {
    const finished = (result.records || []).filter((entry) => entry.status === "completed");
    const pages = finished.map((entry) => {
        const meta = entry.metadata;
        return {
            url: entry.url,
            title: meta?.title,
            // Taken from the page metadata, not from the record's crawl status.
            status: meta?.status,
            markdown: entry.markdown,
            html: entry.html,
            json: entry.json,
        };
    });
    const summary = {
        total: result.total,
        completed: finished.length,
        status: result.status,
    };
    return JSON.stringify({ summary, pages }, null, 2);
}
|
||||
/**
 * Builds the MCP error response for a crawl job that ended unsuccessfully.
 *
 * Known terminal statuses get a dedicated message; any other status falls
 * back to a generic message that echoes the status.
 *
 * @param result Job result; only `status` is read.
 * @param jobId ID of the job, included in every message.
 * @returns A tool response with a single text item and `isError: true`.
 */
function handleErrorResult(result, jobId) {
    const { status } = result;
    const knownFailures = {
        errored: `Crawl job errored. Job ID: ${jobId}`,
        cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
        cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
        cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
    };
    let text = knownFailures[status];
    if (!text) {
        text = `Crawl job failed with status: ${status}. Job ID: ${jobId}`;
    }
    return {
        content: [{ type: "text", text }],
        isError: true,
    };
}
|
||||
// Identity this server reports to MCP clients.
const serverInfo = {
    name: "cloudflare-crawl-mcp",
    version: "1.0.0",
};
// Only the tools capability is declared.
const serverOptions = {
    capabilities: {
        tools: {},
    },
};
// The MCP server instance; request handlers are registered on it below.
const server = new Server(serverInfo, serverOptions);
|
||||
/**
 * JSON Schema for the arguments accepted by every crawl tool.
 * Only `url` is required; the defaults quoted in the descriptions are applied
 * when the crawl request is built, not by this schema.
 */
const baseToolSchema = (() => {
    // Scalar property: { type, description }.
    const field = (type, description) => ({ type, description });
    // Array-of-strings property: { type, items, description }.
    const stringList = (description) => ({
        type: "array",
        items: { type: "string" },
        description,
    });
    return {
        type: "object",
        properties: {
            url: field("string", "The starting URL to crawl"),
            limit: field("number", "Maximum number of pages to crawl (default: 10, max: 100000)"),
            depth: field("number", "Maximum link depth to crawl from the starting URL (default: 1)"),
            includeSubdomains: field("boolean", "If true, follows links to subdomains of the starting URL (default: false)"),
            includeExternalLinks: field("boolean", "If true, follows links to external domains (default: false)"),
            includePatterns: stringList("Only visits URLs that match one of these wildcard patterns"),
            excludePatterns: stringList("Does not visit URLs that match any of these wildcard patterns"),
            render: field("boolean", "If false, does a fast HTML fetch without executing JavaScript (default: true)"),
        },
        required: ["url"],
    };
})();
|
||||
// Markdown footer appended to the description of every crawl tool so MCP
// clients see Cloudflare's plan limits next to the tool.
// NOTE(review): the text says the CF_RATE_LIMIT default is "6 for Free, 600 for
// Paid", but enforceRateLimit() always defaults to 6; and the `maxAge` tip refers
// to an option that baseToolSchema does not expose. The string is emitted at
// runtime, so it is left untouched here.
const RATE_LIMIT_INFO = `
|
||||
---
|
||||
**Cloudflare Browser Rendering Limits:**
|
||||
|
||||
| Plan | Concurrent Browsers | Browser Time | REST API Rate |
|
||||
|------|---------------------|--------------|---------------|
|
||||
| Free | 3 | 10 min/day | 6 req/min |
|
||||
| Paid | 10 | 10 hours/month | 600 req/min |
|
||||
|
||||
**Environment Variables:**
|
||||
- CF_RATE_LIMIT: Override REST API requests per minute (default: 6 for Free, 600 for Paid)
|
||||
|
||||
**Tips:**
|
||||
- Use \`render: false\` for static content to avoid browser time usage
|
||||
- Use \`maxAge\` to cache results and reduce API calls
|
||||
- Set \`limit\` and \`depth\` appropriately to stay within limits
|
||||
---`;
|
||||
// Tool discovery: advertises the three crawl tools. They share one argument
// schema and differ only in name and description.
server.setRequestHandler(ListToolsRequestSchema, async () => {
    const markdownTool = {
        name: "crawl_url_markdown",
        description: `Crawl a website using Cloudflare Browser Rendering and return content in Markdown format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
        // Shallow copies of the shared schema and of its `properties` map.
        inputSchema: {
            ...baseToolSchema,
            properties: { ...baseToolSchema.properties },
        },
    };
    const htmlTool = {
        name: "crawl_url_html",
        description: `Crawl a website using Cloudflare Browser Rendering and return content in HTML format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
        inputSchema: baseToolSchema,
    };
    const jsonTool = {
        name: "crawl_url_json",
        description: `Crawl a website using Cloudflare Browser Rendering and return content in JSON format. This uses Workers AI for data extraction. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
        inputSchema: baseToolSchema,
    };
    return { tools: [markdownTool, htmlTool, jsonTool] };
});
|
||||
// Tool execution: runs a crawl for `crawl_url_<format>` and returns the
// formatted result. Every failure is reported as a tool response with
// `isError: true` rather than thrown.
//
// Changes from the previous version:
//   - `arguments` may be omitted by the client; it now defaults to {} instead
//     of surfacing "Cannot read properties of undefined";
//   - `url` must be a non-empty string, checked before credentials are read
//     or any API request is made.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const { name, arguments: rawArgs } = request.params;
    const args = rawArgs ?? {};
    const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
    if (!toolMatch) {
        return {
            content: [{ type: "text", text: `Unknown tool: ${name}` }],
            isError: true,
        };
    }
    if (typeof args.url !== "string" || args.url.trim() === "") {
        return {
            content: [{ type: "text", text: "Error: Missing required argument: url" }],
            isError: true,
        };
    }
    const format = toolMatch[1];
    const formatMap = {
        markdown: ["markdown"],
        html: ["html"],
        json: ["json"],
    };
    const formats = formatMap[format];
    try {
        const apiToken = getEnv("CF_API_TOKEN");
        const accountId = getEnv("CF_ACCOUNT_ID");
        // Forward only the arguments declared in the tool schema.
        const crawlArgs = {
            url: args.url,
            limit: args.limit,
            depth: args.depth,
            includeSubdomains: args.includeSubdomains,
            includeExternalLinks: args.includeExternalLinks,
            includePatterns: args.includePatterns,
            excludePatterns: args.excludePatterns,
            render: args.render,
        };
        const options = buildCrawlOptions(crawlArgs, formats);
        const jobId = await initiateCrawl(accountId, apiToken, options);
        const result = await waitForCrawl(accountId, apiToken, jobId);
        // Statuses that mean the job ended without usable output.
        const terminalStatuses = ["errored", "cancelled_due_to_timeout", "cancelled_due_to_limits", "cancelled_by_user"];
        if (terminalStatuses.includes(result.status)) {
            return handleErrorResult(result, jobId);
        }
        const formatterMap = {
            markdown: formatMarkdownResult,
            html: formatHtmlResult,
            json: formatJsonResult,
        };
        const formattedContent = formatterMap[format](result);
        return {
            content: [{ type: "text", text: formattedContent }],
        };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return {
            content: [{ type: "text", text: `Error: ${message}` }],
            isError: true,
        };
    }
});
|
||||
/**
 * Entry point: connects the MCP server to a stdio transport.
 */
async function main() {
    await server.connect(new StdioServerTransport());
}
|
||||
// Start the server; a startup failure is logged to stderr and the process
// exits with status 1.
main().then(undefined, (failure) => {
    console.error("Server error:", failure);
    process.exit(1);
});
|
||||
Vendored
+1
@@ -0,0 +1 @@
|
||||
export {};
|
||||
Vendored
+628
@@ -0,0 +1,628 @@
|
||||
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
||||
// Root of the Cloudflare v4 REST API, used by this file's local copies of the crawl helpers.
const API_BASE = "https://api.cloudflare.com/client/v4";
|
||||
/**
 * Reads a mandatory environment variable.
 *
 * @param key Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or set to an empty string.
 */
function getEnv(key) {
    const raw = process.env[key];
    // An empty string is treated exactly like a missing variable.
    if (raw) {
        return raw;
    }
    throw new Error(`Missing required environment variable: ${key}`);
}
|
||||
/**
 * Test-local copy of the crawl starter: POSTs a job definition to the
 * Browser Rendering `/crawl` endpoint, with no rate limiting or retries.
 *
 * @param accountId Cloudflare account ID.
 * @param apiToken API token sent as a Bearer credential.
 * @param options Crawl options; unset fields fall back to limit 10, depth 1,
 *   formats ["markdown"], render true, source "all", options {}.
 * @returns `result.id` from the API response.
 * @throws Error on a non-2xx response or an unsuccessful API envelope.
 */
async function initiateCrawl(accountId, apiToken, options) {
    const endpoint = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`;
    const payload = {
        url: options.url,
        limit: options.limit ?? 10,
        depth: options.depth ?? 1,
        formats: options.formats ?? ["markdown"],
        render: options.render ?? true,
        maxAge: options.maxAge,
        source: options.source ?? "all",
        options: options.options ?? {},
    };
    const response = await fetch(endpoint, {
        method: "POST",
        headers: {
            Authorization: `Bearer ${apiToken}`,
            "Content-Type": "application/json",
        },
        body: JSON.stringify(payload),
    });
    if (!response.ok) {
        const detail = await response.text();
        throw new Error(`Failed to initiate crawl: ${response.status} ${detail}`);
    }
    const envelope = await response.json();
    if (envelope.success) {
        return envelope.result.id;
    }
    throw new Error(`Crawl initiation failed: ${JSON.stringify(envelope.errors)}`);
}
|
||||
/**
 * Test-local copy of the poller: requests the job with `?limit=1` until its
 * status is no longer "running" and returns that response's `result`.
 *
 * @param accountId Cloudflare account ID.
 * @param apiToken API token sent as a Bearer credential.
 * @param jobId ID of the crawl job to watch.
 * @param maxAttempts Maximum number of status polls (default 120).
 * @param delayMs Pause between polls in milliseconds (default 5000).
 * @returns The job's `result` object.
 * @throws Error on a non-2xx response or when the job is still running after
 *   `maxAttempts` polls.
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
    const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;
    const pause = () => new Promise((resolve) => setTimeout(resolve, delayMs));
    let polls = 0;
    while (polls < maxAttempts) {
        const response = await fetch(statusUrl, {
            headers: {
                Authorization: `Bearer ${apiToken}`,
            },
        });
        if (!response.ok) {
            const detail = await response.text();
            throw new Error(`Failed to check crawl status: ${response.status} ${detail}`);
        }
        const { result } = await response.json();
        if (result.status !== "running") {
            return result;
        }
        await pause();
        polls += 1;
    }
    throw new Error("Crawl job did not complete within timeout");
}
|
||||
/**
 * Maps flat tool arguments onto the nested crawl options shape.
 *
 * Link-following settings are grouped under `options`; everything else stays
 * at the top level. Missing arguments are passed through as `undefined`.
 *
 * @param args Tool arguments (url, limit, depth, render and link settings).
 * @param formats Output formats to request.
 * @returns The crawl options object.
 */
function buildCrawlOptions(args, formats) {
    const { url, limit, depth, render } = args;
    const linkSettings = {
        includeExternalLinks: args.includeExternalLinks,
        includeSubdomains: args.includeSubdomains,
        includePatterns: args.includePatterns,
        excludePatterns: args.excludePatterns,
    };
    return { url, limit, depth, formats, render, options: linkSettings };
}
|
||||
/**
 * Renders a crawl result as one Markdown document.
 *
 * Only records whose status is "completed" are included. Each becomes a
 * section headed by the page title (falling back to the URL), followed by the
 * URL, the page's Markdown and a horizontal rule. A summary line comes first.
 *
 * @param result Crawl result with `total` and an optional `records` array.
 * @returns The summary line followed by the concatenated sections.
 */
function formatMarkdownResult(result) {
    const sections = [];
    for (const page of result.records || []) {
        if (page.status !== "completed") {
            continue;
        }
        const heading = page.metadata?.title || page.url;
        sections.push(`## ${heading}\n\nURL: ${page.url}\n\n${page.markdown || ""}\n\n---\n`);
    }
    const body = sections.join("\n");
    return `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.\n\n${body}`;
}
|
||||
/**
 * Renders a crawl result as a sequence of HTML `<article>` elements.
 *
 * Only records whose status is "completed" are included. Each article shows
 * the page title (falling back to the URL), a source link and the page's HTML.
 * Values are interpolated as-is, without escaping. A plain-text summary line
 * comes first.
 *
 * @param result Crawl result with `total` and an optional `records` array.
 * @returns The summary line followed by the concatenated articles.
 */
function formatHtmlResult(result) {
    const articles = [];
    for (const page of result.records || []) {
        if (page.status !== "completed") {
            continue;
        }
        const heading = page.metadata?.title || page.url;
        articles.push(`<article>\n <h2>${heading}</h2>\n <p>Source: <a href="${page.url}">${page.url}</a></p>\n <div class="content">${page.html || ""}</div>\n</article>\n`);
    }
    const body = articles.join("\n");
    return `Crawl completed: ${articles.length} of ${result.total} pages crawled successfully.\n\n${body}`;
}
|
||||
/**
 * Serialises a crawl result as pretty-printed JSON (two-space indent).
 *
 * The output has a `summary` (total, number of completed records, job status)
 * and a `pages` array built from the records whose status is "completed".
 *
 * @param result Crawl result with `total`, `status` and optional `records`.
 * @returns The JSON text.
 */
function formatJsonResult(result) {
    const finished = (result.records || []).filter((entry) => entry.status === "completed");
    const pages = finished.map((entry) => {
        const meta = entry.metadata;
        return {
            url: entry.url,
            title: meta?.title,
            // Taken from the page metadata, not from the record's crawl status.
            status: meta?.status,
            markdown: entry.markdown,
            html: entry.html,
            json: entry.json,
        };
    });
    const summary = {
        total: result.total,
        completed: finished.length,
        status: result.status,
    };
    return JSON.stringify({ summary, pages }, null, 2);
}
|
||||
/**
 * Builds the MCP error response for a crawl job that ended unsuccessfully.
 *
 * Known terminal statuses get a dedicated message; any other status falls
 * back to a generic message that echoes the status.
 *
 * @param result Job result; only `status` is read.
 * @param jobId ID of the job, included in every message.
 * @returns A tool response with a single text item and `isError: true`.
 */
function handleErrorResult(result, jobId) {
    const { status } = result;
    const knownFailures = {
        errored: `Crawl job errored. Job ID: ${jobId}`,
        cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
        cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
        cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
    };
    let text = knownFailures[status];
    if (!text) {
        text = `Crawl job failed with status: ${status}. Job ID: ${jobId}`;
    }
    return {
        content: [{ type: "text", text }],
        isError: true,
    };
}
|
||||
// Table-driven tests for getEnv(): one passing lookup and the two "missing"
// flavours (empty string and undefined).
//
// Each case snapshots the variable it touches and restores it in `finally`,
// so the suite no longer leaks TEST_VAR / EMPTY_VAR into later tests.
describe('getEnv', () => {
    const testCases = [
        {
            name: 'returns value when env var exists',
            envKey: 'TEST_VAR',
            envValue: 'test-value',
            expected: 'test-value',
        },
        {
            name: 'throws when env var is empty string',
            envKey: 'EMPTY_VAR',
            envValue: '',
            expectedError: 'Missing required environment variable: EMPTY_VAR',
        },
        {
            name: 'throws when env var is undefined',
            envKey: 'UNDEFINED_VAR',
            envValue: undefined,
            expectedError: 'Missing required environment variable: UNDEFINED_VAR',
        },
    ];
    it.each(testCases)('$name', ({ envKey, envValue, expected, expectedError }) => {
        const previous = process.env[envKey];
        try {
            // Assigning undefined to process.env stores the string "undefined",
            // so an undefined value has to be expressed by deleting the key.
            if (envValue === undefined) {
                delete process.env[envKey];
            }
            else {
                process.env[envKey] = envValue;
            }
            if (expectedError) {
                expect(() => getEnv(envKey)).toThrow(expectedError);
            }
            else {
                expect(getEnv(envKey)).toBe(expected);
            }
        }
        finally {
            if (previous === undefined) {
                delete process.env[envKey];
            }
            else {
                process.env[envKey] = previous;
            }
        }
    });
});
|
||||
// Table-driven tests: buildCrawlOptions maps flat arguments plus a formats list
// onto the nested crawl-options shape.
describe('buildCrawlOptions', () => {
    // Nested filter block expected when the caller passes no link/pattern settings.
    const unsetFilters = {
        includeExternalLinks: undefined,
        includeSubdomains: undefined,
        includePatterns: undefined,
        excludePatterns: undefined,
    };
    const scenarios = [
        {
            name: 'builds options with markdown format',
            args: { url: 'https://example.com' },
            formats: ['markdown'],
            expected: {
                url: 'https://example.com',
                limit: undefined,
                depth: undefined,
                formats: ['markdown'],
                render: undefined,
                options: unsetFilters,
            },
        },
        {
            name: 'builds options with all parameters',
            args: {
                url: 'https://example.com',
                limit: 50,
                depth: 2,
                includeSubdomains: true,
                includeExternalLinks: false,
                includePatterns: ['**/docs/**'],
                excludePatterns: ['**/archive/**'],
                render: true,
            },
            formats: ['html'],
            expected: {
                url: 'https://example.com',
                limit: 50,
                depth: 2,
                formats: ['html'],
                render: true,
                options: {
                    includeExternalLinks: false,
                    includeSubdomains: true,
                    includePatterns: ['**/docs/**'],
                    excludePatterns: ['**/archive/**'],
                },
            },
        },
        {
            name: 'builds options with json format',
            args: { url: 'https://api.example.com', limit: 100 },
            formats: ['json'],
            expected: {
                url: 'https://api.example.com',
                limit: 100,
                formats: ['json'],
                depth: undefined,
                render: undefined,
                options: unsetFilters,
            },
        },
        {
            name: 'handles empty options object',
            args: { url: 'https://test.com' },
            formats: ['markdown'],
            expected: {
                url: 'https://test.com',
                formats: ['markdown'],
                options: unsetFilters,
            },
        },
    ];
    it.each(scenarios)('$name', ({ args, formats, expected }) => {
        expect(buildCrawlOptions(args, formats)).toEqual(expected);
    });
});
|
||||
// Table-driven tests: each scenario lists substrings the Markdown output must contain.
describe('formatMarkdownResult', () => {
    const scenarios = [
        {
            name: 'formats single completed page',
            result: {
                total: 1,
                status: 'completed',
                records: [
                    { url: 'https://example.com', status: 'completed', markdown: '# Hello World', metadata: { title: 'Home Page', status: 200 } },
                ],
            },
            expectedContains: ['## Home Page', '# Hello World', 'Crawl completed: 1 of 1'],
        },
        {
            name: 'formats multiple completed pages',
            result: {
                total: 2,
                status: 'completed',
                records: [
                    { url: 'https://example.com', status: 'completed', markdown: '# Page 1', metadata: { title: 'Page One', status: 200 } },
                    { url: 'https://example.com/about', status: 'completed', markdown: '# About Us', metadata: { title: 'About', status: 200 } },
                ],
            },
            expectedContains: ['## Page One', '## About', 'Crawl completed: 2 of 2'],
        },
        {
            name: 'handles missing markdown content',
            result: {
                total: 1,
                status: 'completed',
                records: [
                    { url: 'https://example.com', status: 'completed', markdown: '', metadata: { title: 'Test', status: 200 } },
                ],
            },
            expectedContains: ['## Test', 'URL: https://example.com'],
        },
        {
            name: 'uses url as title when metadata.title is missing',
            result: {
                total: 1,
                status: 'completed',
                records: [
                    { url: 'https://example.com/unnamed', status: 'completed', markdown: 'Content here' },
                ],
            },
            expectedContains: ['## https://example.com/unnamed', 'Content here'],
        },
        {
            name: 'handles empty records array',
            result: { total: 0, status: 'completed', records: [] },
            expectedContains: ['Crawl completed: 0 of 0'],
        },
        {
            name: 'filters out non-completed records',
            result: {
                total: 3,
                status: 'completed',
                records: [
                    { url: 'https://example.com/1', status: 'completed', markdown: '# Done' },
                    { url: 'https://example.com/2', status: 'errored', markdown: '# Failed' },
                    { url: 'https://example.com/3', status: 'skipped' },
                ],
            },
            expectedContains: ['Crawl completed: 1 of 3', '# Done'],
        },
    ];
    it.each(scenarios)('$name', ({ result, expectedContains }) => {
        const rendered = formatMarkdownResult(result);
        for (const fragment of expectedContains) {
            expect(rendered).toContain(fragment);
        }
    });
});
|
||||
// Table-driven tests: each scenario lists substrings the HTML output must contain.
describe('formatHtmlResult', () => {
    const scenarios = [
        {
            name: 'formats single completed page with HTML',
            result: {
                total: 1,
                status: 'completed',
                records: [
                    { url: 'https://example.com', status: 'completed', html: '<p>Hello World</p>', metadata: { title: 'Home Page', status: 200 } },
                ],
            },
            expectedContains: ['<h2>Home Page</h2>', '<p>Hello World</p>', 'Crawl completed: 1 of 1'],
        },
        {
            name: 'formats multiple completed pages',
            result: {
                total: 2,
                status: 'completed',
                records: [
                    { url: 'https://example.com', status: 'completed', html: '<div>Page 1</div>', metadata: { title: 'Page One', status: 200 } },
                    { url: 'https://example.com/about', status: 'completed', html: '<div>About Us</div>', metadata: { title: 'About', status: 200 } },
                ],
            },
            expectedContains: ['<h2>Page One</h2>', '<h2>About</h2>', 'Crawl completed: 2 of 2'],
        },
        {
            name: 'handles missing HTML content',
            result: {
                total: 1,
                status: 'completed',
                records: [
                    { url: 'https://example.com', status: 'completed', html: '', metadata: { title: 'Test', status: 200 } },
                ],
            },
            expectedContains: ['<h2>Test</h2>', '<a href="https://example.com">'],
        },
    ];
    it.each(scenarios)('$name', ({ result, expectedContains }) => {
        const rendered = formatHtmlResult(result);
        for (const fragment of expectedContains) {
            expect(rendered).toContain(fragment);
        }
    });
});
|
||||
// Table-driven tests: the JSON output must parse and its summary must match the input.
describe('formatJsonResult', () => {
    const scenarios = [
        {
            name: 'formats single completed page as JSON',
            result: {
                total: 1,
                status: 'completed',
                records: [
                    {
                        url: 'https://example.com',
                        status: 'completed',
                        markdown: '# Hello',
                        html: '<h1>Hello</h1>',
                        json: { key: 'value' },
                        metadata: { title: 'Home', status: 200 },
                    },
                ],
            },
        },
        {
            name: 'formats multiple completed pages as JSON',
            result: {
                total: 2,
                status: 'completed',
                records: [
                    { url: 'https://example.com/page1', status: 'completed', markdown: '# Page 1' },
                    { url: 'https://example.com/page2', status: 'completed', markdown: '# Page 2' },
                ],
            },
        },
        {
            name: 'includes summary with correct counts',
            result: {
                total: 5,
                status: 'completed',
                records: [
                    { url: 'https://example.com/1', status: 'completed' },
                    { url: 'https://example.com/2', status: 'completed' },
                    { url: 'https://example.com/3', status: 'errored' },
                    { url: 'https://example.com/4', status: 'skipped' },
                    { url: 'https://example.com/5', status: 'completed' },
                ],
            },
        },
        {
            name: 'handles empty records',
            result: { total: 0, status: 'completed', records: [] },
        },
    ];
    it.each(scenarios)('$name', ({ result }) => {
        const parsed = JSON.parse(formatJsonResult(result));
        expect(parsed).toHaveProperty('summary');
        expect(parsed).toHaveProperty('pages');
        // Recompute the completed count independently of the function under test.
        const completedCount = result.records.reduce((count, r) => (r.status === 'completed' ? count + 1 : count), 0);
        const { summary } = parsed;
        expect(summary.completed).toBe(completedCount);
        expect(summary.total).toBe(result.total);
        expect(summary.status).toBe(result.status);
    });
});
|
||||
// Table-driven tests: every status yields an error response whose text contains
// the listed fragments.
describe('handleErrorResult', () => {
    const scenarios = [
        {
            name: 'handles errored status',
            result: { status: 'errored' },
            jobId: 'test-job-123',
            expectedError: true,
            expectedContains: ['errored', 'test-job-123'],
        },
        {
            name: 'handles cancelled_due_to_timeout status',
            result: { status: 'cancelled_due_to_timeout' },
            jobId: 'job-456',
            expectedError: true,
            expectedContains: ['timeout', 'job-456'],
        },
        {
            name: 'handles cancelled_due_to_limits status',
            result: { status: 'cancelled_due_to_limits' },
            jobId: 'job-789',
            expectedError: true,
            expectedContains: ['limits', 'job-789'],
        },
        {
            name: 'handles cancelled_by_user status',
            result: { status: 'cancelled_by_user' },
            jobId: 'job-000',
            expectedError: true,
            expectedContains: ['cancelled by user', 'job-000'],
        },
        {
            name: 'handles unknown status',
            result: { status: 'some_unknown_status' },
            jobId: 'job-unknown',
            expectedError: true,
            expectedContains: ['some_unknown_status', 'job-unknown'],
        },
    ];
    it.each(scenarios)('$name', ({ result, jobId, expectedError, expectedContains }) => {
        const response = handleErrorResult(result, jobId);
        expect(response.isError).toBe(expectedError);
        for (const fragment of expectedContains) {
            expect(response.content[0].text).toContain(fragment);
        }
    });
});
|
||||
// Tests initiateCrawl against a stubbed global fetch: success, HTTP failure, API failure.
describe('initiateCrawl', () => {
    beforeEach(() => {
        vi.stubGlobal('fetch', vi.fn());
    });
    const scenarios = [
        {
            name: 'initiates crawl successfully',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            options: { url: 'https://example.com', formats: ['markdown'] },
            mockResponse: { success: true, result: { id: 'job-123' } },
            expectedJobId: 'job-123',
        },
        {
            name: 'throws on HTTP error',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            options: { url: 'https://example.com' },
            mockResponse: null,
            mockStatus: 401,
            expectedError: 'Failed to initiate crawl: 401',
        },
        {
            name: 'throws on API failure',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            options: { url: 'https://example.com' },
            mockResponse: { success: false, errors: [{ message: 'Invalid URL' }] },
            expectedError: 'Crawl initiation failed',
        },
    ];
    it.each(scenarios)('$name', async ({ accountId, apiToken, options, mockResponse, mockStatus, expectedJobId, expectedError }) => {
        // An explicit mockStatus simulates an HTTP-level failure with an empty body;
        // otherwise the mocked JSON payload is served with a 200.
        const reply = mockStatus
            ? new Response('', { status: mockStatus })
            : new Response(JSON.stringify(mockResponse), {
                status: 200,
                headers: { 'Content-Type': 'application/json' }
            });
        vi.mocked(fetch).mockResolvedValueOnce(reply);
        if (expectedError) {
            await expect(initiateCrawl(accountId, apiToken, options)).rejects.toThrow(expectedError);
            return;
        }
        const jobId = await initiateCrawl(accountId, apiToken, options);
        expect(jobId).toBe(expectedJobId);
    });
});
|
||||
// Tests waitForCrawl against a stubbed global fetch: any non-running status is
// returned on the first poll.
describe('waitForCrawl', () => {
    beforeEach(() => {
        vi.stubGlobal('fetch', vi.fn());
    });
    const scenarios = [
        {
            name: 'returns completed result immediately',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            jobId: 'job-123',
            mockResponse: { result: { status: 'completed', total: 5, records: [] } },
            expectedStatus: 'completed',
        },
        {
            name: 'returns errored result',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            jobId: 'job-123',
            mockResponse: { result: { status: 'errored', error: 'Something went wrong' } },
            expectedStatus: 'errored',
        },
        {
            name: 'returns cancelled_due_to_limits result',
            accountId: 'acc-123',
            apiToken: 'token-abc',
            jobId: 'job-123',
            mockResponse: { result: { status: 'cancelled_due_to_limits' } },
            expectedStatus: 'cancelled_due_to_limits',
        },
    ];
    it.each(scenarios)('$name', async ({ accountId, apiToken, jobId, mockResponse, expectedStatus }) => {
        const reply = new Response(JSON.stringify(mockResponse), {
            status: 200,
            headers: { 'Content-Type': 'application/json' }
        });
        vi.mocked(fetch).mockResolvedValue(reply);
        // One attempt with a 1 ms delay keeps the test fast.
        const outcome = await waitForCrawl(accountId, apiToken, jobId, 1, 1);
        expect(outcome.status).toBe(expectedStatus);
    });
});
|
||||
Vendored
+1
@@ -0,0 +1 @@
|
||||
export {};
|
||||
Vendored
+134
@@ -0,0 +1,134 @@
|
||||
import { describe, it, expect, beforeAll } from 'vitest';
|
||||
const API_BASE = "https://api.cloudflare.com/client/v4";
|
||||
/**
 * Starts a crawl job by POSTing to the account's browser-rendering crawl endpoint.
 *
 * Omitted options fall back to: limit 10, depth 1, formats ["markdown"],
 * render true, options {}.
 *
 * @param accountId Account identifier placed in the request path.
 * @param apiToken Token sent as a Bearer Authorization header.
 * @param options Crawl settings; `url` plus optional limit/depth/formats/render/options.
 * @returns The `id` of the created job, taken from `result.id` in the response.
 * @throws Error when the HTTP response is not OK or the body reports `success` as falsy.
 */
async function initiateCrawl(accountId, apiToken, options) {
    const endpoint = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`;
    const payload = {
        url: options.url,
        limit: options.limit ?? 10,
        depth: options.depth ?? 1,
        formats: options.formats ?? ["markdown"],
        render: options.render ?? true,
        options: options.options ?? {},
    };
    const response = await fetch(endpoint, {
        method: "POST",
        headers: {
            Authorization: `Bearer ${apiToken}`,
            "Content-Type": "application/json",
        },
        body: JSON.stringify(payload),
    });
    if (!response.ok) {
        const details = await response.text();
        throw new Error(`Failed to initiate crawl: ${response.status} ${details}`);
    }
    const data = await response.json();
    if (data.success) {
        return data.result.id;
    }
    throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
}
|
||||
/**
 * Polls the crawl job endpoint until the job leaves the "running" state.
 *
 * @param accountId Account identifier placed in the request path.
 * @param apiToken Token sent as a Bearer Authorization header.
 * @param jobId Identifier of the crawl job to poll.
 * @param maxAttempts Maximum number of polls (default 60).
 * @param delayMs Pause between consecutive polls in milliseconds (default 5000).
 * @returns The `result` object of the first response whose status is not "running".
 * @throws Error when a poll returns a non-OK HTTP response, when the body has no
 *         `result`, or when every attempt still reports "running".
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 60, delayMs = 5000) {
    const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        const response = await fetch(statusUrl, {
            headers: {
                Authorization: `Bearer ${apiToken}`,
            },
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
        }
        const data = await response.json();
        // Fail with a descriptive error instead of a TypeError on a malformed body.
        if (!data?.result) {
            throw new Error(`Failed to check crawl status: response for job ${jobId} contained no result`);
        }
        if (data.result.status !== "running") {
            return data.result;
        }
        // Sleeping after the final poll would only delay the timeout error below.
        if (attempt < maxAttempts) {
            await new Promise((resolve) => setTimeout(resolve, delayMs));
        }
    }
    throw new Error("Crawl job did not complete within timeout");
}
|
||||
/**
 * Reads a required environment variable.
 *
 * @param key Name of the variable in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or an empty string.
 */
function getEnv(key) {
    const { [key]: found } = process.env;
    if (found) {
        return found;
    }
    throw new Error(`Missing required environment variable: ${key}`);
}
|
||||
// Live integration test against the Cloudflare crawl API. It is skipped unless both
// CF_API_TOKEN and CF_ACCOUNT_ID are present in the environment.
describe('Integration: Cloudflare Crawl API', () => {
    const apiToken = process.env.CF_API_TOKEN;
    const accountId = process.env.CF_ACCOUNT_ID;
    const hasCredentials = apiToken && accountId;
    beforeAll(() => {
        if (!hasCredentials) {
            console.log('\n⚠️ Skipping integration tests - CF_API_TOKEN or CF_ACCOUNT_ID not set\n');
        }
    });
    it.skipIf(!hasCredentials)('should crawl raczylo.com with multiple pages in markdown format', async () => {
        // Distinct names avoid shadowing the describe-level credentials.
        const cfAccountId = getEnv("CF_ACCOUNT_ID");
        const cfApiToken = getEnv("CF_API_TOKEN");
        try {
            const jobId = await initiateCrawl(cfAccountId, cfApiToken, {
                url: "https://raczylo.com",
                limit: 5,
                depth: 2,
                formats: ["markdown"],
            });
            console.log(` Started crawl job: ${jobId}`);
            expect(jobId).toBeDefined();
            expect(typeof jobId).toBe("string");
            const result = await waitForCrawl(cfAccountId, cfApiToken, jobId, 60, 5000);
            console.log(` Crawl status: ${result.status}`);
            console.log(` Total pages discovered: ${result.total}`);
            console.log(` Pages finished: ${result.finished}`);
            expect(result.status).toBe("completed");
            expect(result.total).toBeGreaterThan(0);
            expect(result.records).toBeDefined();
            expect(Array.isArray(result.records)).toBe(true);
            expect(result.records.length).toBeGreaterThan(0);
            const completedRecords = result.records.filter((r) => r.status === "completed");
            console.log(` Completed pages: ${completedRecords.length}`);
            // Without at least one completed page the per-record checks below are vacuous.
            expect(completedRecords.length).toBeGreaterThan(0);
            completedRecords.forEach((record, index) => {
                expect(record.url).toBeDefined();
                expect(record.markdown).toBeDefined();
                expect(record.markdown.length).toBeGreaterThan(0);
                console.log(` Page ${index + 1}: ${record.url} (${record.markdown.length} chars)`);
            });
            // Check a completed record: records[0] may be skipped/errored and carry no markdown.
            const firstRecord = completedRecords[0];
            expect(firstRecord.markdown).toContain("#");
        }
        catch (error) {
            // Thrown values are not guaranteed to be Error instances.
            const message = error instanceof Error ? error.message : String(error);
            if (message.includes("Rate limit")) {
                console.log(" ⚠️ Skipped - Rate limit exceeded");
                return;
            }
            throw error;
        }
    }, 360000);
});
|
||||
// Verifies getEnv for the two credentials the server requires. Each test restores
// the variable it touched so real credentials in the environment are not lost.
describe('Environment Variable Validation', () => {
    // Runs `body` with process.env[key] set to `value` (removed when `value` is
    // undefined), then puts the previous value back even if `body` throws.
    const withEnv = (key, value, body) => {
        const previous = process.env[key];
        if (value === undefined) {
            delete process.env[key];
        }
        else {
            process.env[key] = value;
        }
        try {
            body();
        }
        finally {
            if (previous === undefined) {
                delete process.env[key];
            }
            else {
                process.env[key] = previous;
            }
        }
    };
    const testCases = [
        {
            name: 'CF_API_TOKEN is required',
            envKey: 'CF_API_TOKEN',
            expectedError: 'Missing required environment variable: CF_API_TOKEN',
        },
        {
            name: 'CF_ACCOUNT_ID is required',
            envKey: 'CF_ACCOUNT_ID',
            expectedError: 'Missing required environment variable: CF_ACCOUNT_ID',
        },
    ];
    it.each(testCases)('$name', ({ envKey, expectedError }) => {
        withEnv(envKey, undefined, () => {
            expect(() => getEnv(envKey)).toThrow(expectedError);
        });
    });
    it('should return value when CF_API_TOKEN is set', () => {
        withEnv('CF_API_TOKEN', 'test-token', () => {
            expect(getEnv('CF_API_TOKEN')).toBe('test-token');
        });
    });
    it('should return value when CF_ACCOUNT_ID is set', () => {
        withEnv('CF_ACCOUNT_ID', 'test-account', () => {
            expect(getEnv('CF_ACCOUNT_ID')).toBe('test-account');
        });
    });
});
|
||||
Generated
+6926
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"name": "@lukaszraczylo/cloudflare-crawl-mcp",
|
||||
"version": "1.0.0",
|
||||
"description": "MCP server for Cloudflare Browser Rendering Crawl API",
|
||||
"author": "Lukasz Raczylo <hello@raczylo.com> (https://raczylo.com)",
|
||||
"main": "dist/index.js",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"start": "node dist/index.js",
|
||||
"test": "vitest run",
|
||||
"test:watch": "vitest",
|
||||
"prepublishOnly": "npm run build"
|
||||
},
|
||||
"dependencies": {
|
||||
"@modelcontextprotocol/sdk": "^1.27.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^25.4.0",
|
||||
"typescript": "^5.9.3",
|
||||
"vitest": "^4.0.18"
|
||||
},
|
||||
"publishConfig": {
|
||||
"access": "public"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,724 @@
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
||||
|
||||
const API_BASE = "https://api.cloudflare.com/client/v4";
|
||||
|
||||
/**
 * Settings accepted by `initiateCrawl` and forwarded in the crawl request body.
 *
 * Field meanings beyond their names are not established in this file;
 * confirm them against the Cloudflare crawl API documentation.
 */
interface CrawlOptions {
  /** Start URL of the crawl. */
  url: string;
  /** `initiateCrawl` sends 10 when omitted. */
  limit?: number;
  /** `initiateCrawl` sends 1 when omitted. */
  depth?: number;
  /** Output formats; `initiateCrawl` sends ["markdown"] when omitted. */
  formats?: Array<string>;
  /** `initiateCrawl` sends true when omitted. */
  render?: boolean;
  /** Forwarded unchanged; no default is applied. */
  maxAge?: number;
  /** `initiateCrawl` sends "all" when omitted. */
  source?: string;
  /** Link and pattern filters; `initiateCrawl` sends {} when omitted. */
  options?: {
    includeExternalLinks?: boolean;
    includeSubdomains?: boolean;
    includePatterns?: Array<string>;
    excludePatterns?: Array<string>;
  };
}
|
||||
|
||||
/**
 * Reads a required environment variable.
 *
 * @param key - Name of the variable in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or an empty string.
 */
function getEnv(key: string): string {
  const { [key]: found } = process.env;
  if (found) {
    return found;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
|
||||
|
||||
/**
 * Starts a crawl job by POSTing to the account's browser-rendering crawl endpoint.
 *
 * Omitted options fall back to: limit 10, depth 1, formats ["markdown"],
 * render true, source "all", options {}. `maxAge` is forwarded as given.
 *
 * @param accountId - Account identifier placed in the request path.
 * @param apiToken - Token sent as a Bearer Authorization header.
 * @param options - Crawl settings.
 * @returns The `id` of the created job, taken from `result.id` in the response.
 * @throws Error when the HTTP response is not OK or the body reports `success` as falsy.
 */
async function initiateCrawl(
  accountId: string,
  apiToken: string,
  options: CrawlOptions
): Promise<string> {
  const endpoint = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`;
  const payload = {
    url: options.url,
    limit: options.limit ?? 10,
    depth: options.depth ?? 1,
    formats: options.formats ?? ["markdown"],
    render: options.render ?? true,
    maxAge: options.maxAge,
    source: options.source ?? "all",
    options: options.options ?? {},
  };

  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiToken}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const details = await response.text();
    throw new Error(`Failed to initiate crawl: ${response.status} ${details}`);
  }

  const data = await response.json();
  if (data.success) {
    return data.result.id;
  }
  throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
}
|
||||
|
||||
/**
 * Polls the crawl job endpoint until the job leaves the "running" state.
 *
 * @param accountId - Account identifier placed in the request path.
 * @param apiToken - Token sent as a Bearer Authorization header.
 * @param jobId - Identifier of the crawl job to poll.
 * @param maxAttempts - Maximum number of polls (default 120).
 * @param delayMs - Pause between consecutive polls in milliseconds (default 5000).
 * @returns The `result` object of the first response whose status is not "running".
 * @throws Error when a poll returns a non-OK HTTP response, when the body has no
 *         `result`, or when every attempt still reports "running".
 */
async function waitForCrawl(
  accountId: string,
  apiToken: string,
  jobId: string,
  maxAttempts: number = 120,
  delayMs: number = 5000
): Promise<any> {
  const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const response = await fetch(statusUrl, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
    }

    const data = await response.json();
    // Fail with a descriptive error instead of a TypeError on a malformed body.
    if (!data?.result) {
      throw new Error(`Failed to check crawl status: response for job ${jobId} contained no result`);
    }

    if (data.result.status !== "running") {
      return data.result;
    }

    // Sleeping after the final poll would only delay the timeout error below.
    if (attempt < maxAttempts) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  throw new Error("Crawl job did not complete within timeout");
}
|
||||
|
||||
/**
 * Flat argument shape consumed by `buildCrawlOptions`.
 *
 * The four `include*` / `exclude*` fields are the ones `buildCrawlOptions`
 * nests under `options`; the rest are copied to the top level.
 */
interface CrawlArgs {
  /** Start URL of the crawl. */
  url: string;
  limit?: number;
  depth?: number;
  includeSubdomains?: boolean;
  includeExternalLinks?: boolean;
  includePatterns?: Array<string>;
  excludePatterns?: Array<string>;
  render?: boolean;
}
|
||||
|
||||
/**
 * Converts flat crawl arguments into the nested `CrawlOptions` shape.
 *
 * `url`, `limit`, `depth` and `render` are copied to the top level, `formats` is
 * taken from the second parameter, and the four include/exclude settings are
 * grouped under `options`. Absent arguments are carried over as `undefined`.
 *
 * @param args - Flat arguments.
 * @param formats - Output formats to request.
 * @returns The assembled options object.
 */
function buildCrawlOptions(args: CrawlArgs, formats: string[]): CrawlOptions {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;
  const options = { includeExternalLinks, includeSubdomains, includePatterns, excludePatterns };
  return { url, limit, depth, formats, render, options };
}
|
||||
|
||||
/**
 * Renders the completed pages of a crawl result as Markdown sections.
 *
 * Each record whose `status` is "completed" becomes a section with a level-2
 * heading (the page title, or the URL when no title is present), the URL, the
 * page markdown, and a horizontal rule.
 *
 * @param result - Crawl result; reads `records` (optional) and `total`.
 * @returns A one-line summary followed by the sections.
 */
function formatMarkdownResult(result: any): string {
  const sections: string[] = [];
  for (const page of result.records || []) {
    if (page.status !== "completed") {
      continue;
    }
    const heading = page.metadata?.title || page.url;
    sections.push(`## ${heading}\n\nURL: ${page.url}\n\n${page.markdown || ""}\n\n---\n`);
  }

  const body = sections.join("\n");
  return `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.\n\n${body}`;
}
|
||||
|
||||
/**
 * Renders the completed pages of a crawl result as a series of HTML <article> elements.
 *
 * Only records whose `status` is "completed" are rendered. The page title falls back
 * to the record URL when `metadata.title` is missing or empty.
 *
 * @param result - Crawl result; reads `records` (optional) and `total`.
 * @returns A one-line summary followed by one <article> per completed record.
 */
function formatHtmlResult(result: any): string {
  // Titles and URLs originate from crawled pages, so escape them before placing
  // them in element text or an attribute value. `record.html` is already markup
  // and is deliberately inserted verbatim.
  const escapeHtml = (value: unknown): string =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");

  const records = result.records || [];
  const completedRecords = records.filter((r: any) => r.status === "completed");

  const content = completedRecords
    .map((record: any) => {
      const title = escapeHtml(record.metadata?.title || record.url);
      const url = escapeHtml(record.url);
      return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${url}">${url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
    })
    .join("\n");

  return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
|
||||
|
||||
/**
 * Serializes a crawl result as pretty-printed JSON (2-space indent).
 *
 * The output has a `summary` (total, completed count, job status) and a `pages`
 * array holding one entry per record whose `status` is "completed".
 *
 * @param result - Crawl result; reads `records` (optional), `total` and `status`.
 * @returns The JSON text.
 */
function formatJsonResult(result: any): string {
  const pages: any[] = [];
  for (const entry of result.records || []) {
    if (entry.status !== "completed") {
      continue;
    }
    pages.push({
      url: entry.url,
      title: entry.metadata?.title,
      status: entry.metadata?.status,
      markdown: entry.markdown,
      html: entry.html,
      json: entry.json,
    });
  }

  const summary = {
    total: result.total,
    completed: pages.length,
    status: result.status,
  };

  return JSON.stringify({ summary, pages }, null, 2);
}
|
||||
|
||||
/**
 * Builds an error-flagged tool response for a crawl job that did not complete.
 *
 * @param result - Crawl result; only `status` is read.
 * @param jobId - Job identifier included in the message.
 * @returns `{ content: [{ type: "text", text }], isError: true }` where `text` is the
 *          message for a known status, or a generic message naming the status.
 */
function handleErrorResult(result: any, jobId: string): { content: any[]; isError: boolean } {
  const errorMessages: Record<string, string> = {
    errored: `Crawl job errored. Job ID: ${jobId}`,
    cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
    cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
    cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
  };

  // Look up own keys only: a status such as "toString" or "constructor" would
  // otherwise resolve to an inherited Object.prototype member instead of a message.
  const known = Object.prototype.hasOwnProperty.call(errorMessages, result.status)
    ? errorMessages[result.status]
    : undefined;
  const message = known || `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;

  return {
    content: [{ type: "text" as const, text: message }],
    isError: true,
  };
}
|
||||
|
||||
// Table-driven tests for getEnv: a set variable is returned; an empty or unset one throws.
describe('getEnv', () => {
  const testCases = [
    {
      name: 'returns value when env var exists',
      envKey: 'TEST_VAR',
      envValue: 'test-value',
      expected: 'test-value',
    },
    {
      name: 'throws when env var is empty string',
      envKey: 'EMPTY_VAR',
      envValue: '',
      expectedError: 'Missing required environment variable: EMPTY_VAR',
    },
    {
      name: 'throws when env var is undefined',
      envKey: 'UNDEFINED_VAR',
      envValue: undefined,
      expectedError: 'Missing required environment variable: UNDEFINED_VAR',
    },
  ];

  it.each(testCases)('$name', ({ envKey, envValue, expected, expectedError }) => {
    // Remember the prior value so the case does not leak state into later tests.
    const previous = process.env[envKey];
    try {
      if (envValue === undefined) {
        delete process.env[envKey];
      } else {
        process.env[envKey] = envValue;
      }

      if (expectedError) {
        expect(() => getEnv(envKey)).toThrow(expectedError);
      } else {
        expect(getEnv(envKey)).toBe(expected);
      }
    } finally {
      if (previous === undefined) {
        delete process.env[envKey];
      } else {
        process.env[envKey] = previous;
      }
    }
  });
});
|
||||
|
||||
// Table-driven tests: buildCrawlOptions maps flat arguments plus a formats list
// onto the nested crawl-options shape.
describe('buildCrawlOptions', () => {
  // Nested filter block expected when the caller passes no link/pattern settings.
  const unsetFilters = {
    includeExternalLinks: undefined,
    includeSubdomains: undefined,
    includePatterns: undefined,
    excludePatterns: undefined,
  };

  const scenarios = [
    {
      name: 'builds options with markdown format',
      args: { url: 'https://example.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://example.com',
        limit: undefined,
        depth: undefined,
        formats: ['markdown'],
        render: undefined,
        options: unsetFilters,
      },
    },
    {
      name: 'builds options with all parameters',
      args: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        includeSubdomains: true,
        includeExternalLinks: false,
        includePatterns: ['**/docs/**'],
        excludePatterns: ['**/archive/**'],
        render: true,
      },
      formats: ['html'],
      expected: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        formats: ['html'],
        render: true,
        options: {
          includeExternalLinks: false,
          includeSubdomains: true,
          includePatterns: ['**/docs/**'],
          excludePatterns: ['**/archive/**'],
        },
      },
    },
    {
      name: 'builds options with json format',
      args: { url: 'https://api.example.com', limit: 100 },
      formats: ['json'],
      expected: {
        url: 'https://api.example.com',
        limit: 100,
        formats: ['json'],
        depth: undefined,
        render: undefined,
        options: unsetFilters,
      },
    },
    {
      name: 'handles empty options object',
      args: { url: 'https://test.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://test.com',
        formats: ['markdown'],
        options: unsetFilters,
      },
    },
  ];

  it.each(scenarios)('$name', ({ args, formats, expected }) => {
    expect(buildCrawlOptions(args, formats)).toEqual(expected);
  });
});
|
||||
|
||||
// Table-driven checks for formatMarkdownResult: every scenario supplies a crawl
// result and the fragments that must appear somewhere in the rendered text.
describe('formatMarkdownResult', () => {
  const scenarios = [
    {
      name: 'formats single completed page',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Hello World',
            metadata: { title: 'Home Page', status: 200 },
          },
        ],
      },
      expectedContains: ['## Home Page', '# Hello World', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: {
        total: 2,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Page 1',
            metadata: { title: 'Page One', status: 200 },
          },
          {
            url: 'https://example.com/about',
            status: 'completed',
            markdown: '# About Us',
            metadata: { title: 'About', status: 200 },
          },
        ],
      },
      expectedContains: ['## Page One', '## About', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing markdown content',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '',
            metadata: { title: 'Test', status: 200 },
          },
        ],
      },
      expectedContains: ['## Test', 'URL: https://example.com'],
    },
    {
      name: 'uses url as title when metadata.title is missing',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com/unnamed',
            status: 'completed',
            markdown: 'Content here',
          },
        ],
      },
      expectedContains: ['## https://example.com/unnamed', 'Content here'],
    },
    {
      name: 'handles empty records array',
      result: {
        total: 0,
        status: 'completed',
        records: [],
      },
      expectedContains: ['Crawl completed: 0 of 0'],
    },
    {
      name: 'filters out non-completed records',
      result: {
        total: 3,
        status: 'completed',
        records: [
          { url: 'https://example.com/1', status: 'completed', markdown: '# Done' },
          { url: 'https://example.com/2', status: 'errored', markdown: '# Failed' },
          { url: 'https://example.com/3', status: 'skipped' },
        ],
      },
      expectedContains: ['Crawl completed: 1 of 3', '# Done'],
    },
  ];

  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatMarkdownResult(result);

    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
|
||||
|
||||
// Table-driven checks for formatHtmlResult: every scenario supplies a crawl
// result and the fragments that must appear somewhere in the rendered text.
describe('formatHtmlResult', () => {
  const scenarios = [
    {
      name: 'formats single completed page with HTML',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            html: '<p>Hello World</p>',
            metadata: { title: 'Home Page', status: 200 },
          },
        ],
      },
      expectedContains: ['<h2>Home Page</h2>', '<p>Hello World</p>', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: {
        total: 2,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            html: '<div>Page 1</div>',
            metadata: { title: 'Page One', status: 200 },
          },
          {
            url: 'https://example.com/about',
            status: 'completed',
            html: '<div>About Us</div>',
            metadata: { title: 'About', status: 200 },
          },
        ],
      },
      expectedContains: ['<h2>Page One</h2>', '<h2>About</h2>', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing HTML content',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            html: '',
            metadata: { title: 'Test', status: 200 },
          },
        ],
      },
      expectedContains: ['<h2>Test</h2>', '<a href="https://example.com">'],
    },
  ];

  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatHtmlResult(result);

    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
|
||||
|
||||
// Table-driven checks for formatJsonResult. Besides the summary counters, each
// scenario now also verifies the `pages` array itself, so a formatter that
// leaked errored/skipped records (or dropped completed ones) fails the test.
describe('formatJsonResult', () => {
  const testCases = [
    {
      name: 'formats single completed page as JSON',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Hello',
            html: '<h1>Hello</h1>',
            json: { key: 'value' },
            metadata: { title: 'Home', status: 200 },
          },
        ],
      },
    },
    {
      name: 'formats multiple completed pages as JSON',
      result: {
        total: 2,
        status: 'completed',
        records: [
          {
            url: 'https://example.com/page1',
            status: 'completed',
            markdown: '# Page 1',
          },
          {
            url: 'https://example.com/page2',
            status: 'completed',
            markdown: '# Page 2',
          },
        ],
      },
    },
    {
      name: 'includes summary with correct counts',
      result: {
        total: 5,
        status: 'completed',
        records: [
          { url: 'https://example.com/1', status: 'completed' },
          { url: 'https://example.com/2', status: 'completed' },
          { url: 'https://example.com/3', status: 'errored' },
          { url: 'https://example.com/4', status: 'skipped' },
          { url: 'https://example.com/5', status: 'completed' },
        ],
      },
    },
    {
      name: 'handles empty records',
      result: {
        total: 0,
        status: 'completed',
        records: [],
      },
    },
  ];

  it.each(testCases)('$name', ({ result }) => {
    const output = formatJsonResult(result);
    const parsed = JSON.parse(output);

    expect(parsed).toHaveProperty('summary');
    expect(parsed).toHaveProperty('pages');

    const completedUrls = result.records
      .filter((r: any) => r.status === 'completed')
      .map((r: any) => r.url);

    expect(parsed.summary.completed).toBe(completedUrls.length);
    expect(parsed.summary.total).toBe(result.total);
    expect(parsed.summary.status).toBe(result.status);

    // Only completed records may be emitted, in their original order.
    expect(parsed.pages).toHaveLength(completedUrls.length);
    expect(parsed.pages.map((page: any) => page.url)).toEqual(completedUrls);
  });
});
|
||||
|
||||
// Table-driven checks for handleErrorResult: one scenario per failure status,
// plus an unrecognised status, each listing fragments expected in the message.
describe('handleErrorResult', () => {
  const scenarios = [
    {
      name: 'handles errored status',
      result: { status: 'errored' },
      jobId: 'test-job-123',
      expectedError: true,
      expectedContains: ['errored', 'test-job-123'],
    },
    {
      name: 'handles cancelled_due_to_timeout status',
      result: { status: 'cancelled_due_to_timeout' },
      jobId: 'job-456',
      expectedError: true,
      expectedContains: ['timeout', 'job-456'],
    },
    {
      name: 'handles cancelled_due_to_limits status',
      result: { status: 'cancelled_due_to_limits' },
      jobId: 'job-789',
      expectedError: true,
      expectedContains: ['limits', 'job-789'],
    },
    {
      name: 'handles cancelled_by_user status',
      result: { status: 'cancelled_by_user' },
      jobId: 'job-000',
      expectedError: true,
      expectedContains: ['cancelled by user', 'job-000'],
    },
    {
      name: 'handles unknown status',
      result: { status: 'some_unknown_status' },
      jobId: 'job-unknown',
      expectedError: true,
      expectedContains: ['some_unknown_status', 'job-unknown'],
    },
  ];

  it.each(scenarios)('$name', ({ result, jobId, expectedError, expectedContains }) => {
    const response = handleErrorResult(result, jobId);
    expect(response.isError).toBe(expectedError);

    const messageText = response.content[0].text;
    for (const fragment of expectedContains) {
      expect(messageText).toContain(fragment);
    }
  });
});
|
||||
|
||||
// Exercises initiateCrawl against a stubbed global fetch: one success path and
// two failure paths (non-2xx HTTP status, and a 200 body with success: false).
describe('initiateCrawl', () => {
  beforeEach(() => {
    vi.stubGlobal('fetch', vi.fn());
  });

  const scenarios = [
    {
      name: 'initiates crawl successfully',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      options: { url: 'https://example.com', formats: ['markdown'] },
      mockResponse: { success: true, result: { id: 'job-123' } },
      expectedJobId: 'job-123',
    },
    {
      name: 'throws on HTTP error',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      options: { url: 'https://example.com' },
      mockResponse: null,
      mockStatus: 401,
      expectedError: 'Failed to initiate crawl: 401',
    },
    {
      name: 'throws on API failure',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      options: { url: 'https://example.com' },
      mockResponse: { success: false, errors: [{ message: 'Invalid URL' }] },
      expectedError: 'Crawl initiation failed',
    },
  ];

  it.each(scenarios)(
    '$name',
    async ({ accountId, apiToken, options, mockResponse, mockStatus, expectedJobId, expectedError }) => {
      const fetchMock = vi.mocked(fetch);

      // A failing scenario with an explicit status gets an empty body with that
      // status; every other scenario gets the JSON payload with HTTP 200.
      const reply =
        expectedError && mockStatus
          ? new Response('', { status: mockStatus })
          : new Response(JSON.stringify(mockResponse), {
              status: 200,
              headers: { 'Content-Type': 'application/json' },
            });
      fetchMock.mockResolvedValueOnce(reply);

      if (expectedError) {
        await expect(initiateCrawl(accountId, apiToken, options)).rejects.toThrow(expectedError);
        return;
      }

      const jobId = await initiateCrawl(accountId, apiToken, options);
      expect(jobId).toBe(expectedJobId);
    }
  );
});
|
||||
|
||||
// Exercises waitForCrawl against a stubbed global fetch. Every scenario answers
// the first poll with a non-"running" status, so the call returns right away.
describe('waitForCrawl', () => {
  beforeEach(() => {
    vi.stubGlobal('fetch', vi.fn());
  });

  type WaitForCrawlCase = {
    name: string;
    accountId: string;
    apiToken: string;
    jobId: string;
    mockResponse: any;
    expectedStatus: string;
  };

  const scenarios: Array<WaitForCrawlCase> = [
    {
      name: 'returns completed result immediately',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      jobId: 'job-123',
      mockResponse: { result: { status: 'completed', total: 5, records: [] } },
      expectedStatus: 'completed',
    },
    {
      name: 'returns errored result',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      jobId: 'job-123',
      mockResponse: { result: { status: 'errored', error: 'Something went wrong' } },
      expectedStatus: 'errored',
    },
    {
      name: 'returns cancelled_due_to_limits result',
      accountId: 'acc-123',
      apiToken: 'token-abc',
      jobId: 'job-123',
      mockResponse: { result: { status: 'cancelled_due_to_limits' } },
      expectedStatus: 'cancelled_due_to_limits',
    },
  ];

  it.each(scenarios)('$name', async ({ accountId, apiToken, jobId, mockResponse, expectedStatus }) => {
    const reply = new Response(JSON.stringify(mockResponse), {
      status: 200,
      headers: { 'Content-Type': 'application/json' },
    });
    vi.mocked(fetch).mockResolvedValue(reply);

    // One attempt with a 1 ms delay keeps the test fast.
    const outcome = await waitForCrawl(accountId, apiToken, jobId, 1, 1);
    expect(outcome.status).toBe(expectedStatus);
  });
});
|
||||
+449
@@ -0,0 +1,449 @@
|
||||
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
||||
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
||||
import {
|
||||
CallToolRequestSchema,
|
||||
ListToolsRequestSchema,
|
||||
} from "@modelcontextprotocol/sdk/types.js";
|
||||
|
||||
/** Root of the Cloudflare v4 REST API; account-scoped paths are appended to it. */
const API_BASE = "https://api.cloudflare.com/client/v4";

/** Default number of attempts `fetchWithRetry` makes before giving up. */
const MAX_RETRIES = 3;

/** Minimum gap, in milliseconds, that `enforceRateLimit` keeps between requests. */
const RATE_LIMIT_DELAY_MS = 10_000;

// --- Mutable limiter state, read and written by `enforceRateLimit` ---

/** Epoch milliseconds at which the most recent request was let through. */
let lastRequestTime = 0;

/** Number of requests let through in the current counting window. */
let requestCount = 0;

/** Epoch milliseconds at which the current counting window began. */
let windowStart = Date.now();
|
||||
|
||||
/**
 * Request settings accepted by `initiateCrawl`.
 *
 * Only `url` is mandatory; `initiateCrawl` substitutes the defaults noted on
 * each field when a value is left undefined.
 */
interface CrawlOptions {
  /** Starting URL of the crawl. */
  url: string;
  /** Maximum number of pages to crawl (sent as 10 when omitted). */
  limit?: number;
  /** Maximum link depth from the starting URL (sent as 1 when omitted). */
  depth?: number;
  /** Output formats to request (sent as `["markdown"]` when omitted). */
  formats?: string[];
  /** Whether pages are rendered with JavaScript (sent as `true` when omitted). */
  render?: boolean;
  /** Forwarded to the API unchanged. NOTE(review): units are not visible here - confirm. */
  maxAge?: number;
  /** Forwarded to the API (sent as `"all"` when omitted). */
  source?: string;
  /** Link-following rules (sent as `{}` when omitted). */
  options?: {
    /** Follow links that lead to external domains. */
    includeExternalLinks?: boolean;
    /** Follow links that lead to subdomains of the starting URL. */
    includeSubdomains?: boolean;
    /** Wildcard patterns; only matching URLs are visited. */
    includePatterns?: string[];
    /** Wildcard patterns; matching URLs are not visited. */
    excludePatterns?: string[];
  };
}
|
||||
|
||||
/**
 * Reads a mandatory environment variable.
 *
 * @param key - Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or set to an empty string.
 */
function getEnv(key: string): string {
  const configured = process.env[key];
  if (configured) {
    return configured;
  }

  throw new Error(`Missing required environment variable: ${key}`);
}
|
||||
|
||||
/**
 * Delays the caller as needed so outgoing requests stay within the configured
 * per-minute budget, then records the request in the module-level counters.
 *
 * Two checks are applied:
 *  1. A 60-second counting window: once `requestCount` reaches the limit, the
 *     call sleeps for the rest of the window and starts a fresh one.
 *  2. A minimum gap of `RATE_LIMIT_DELAY_MS` after the previous request, applied
 *     whenever the current window already contains a request.
 *
 * The limit is read from `CF_RATE_LIMIT` on every call. Values that do not
 * parse to an integer >= 1 fall back to 6: previously a non-numeric value
 * produced NaN, which made the window check always false (no limit at all),
 * and 0 or a negative value made every call sleep out the window.
 */
async function enforceRateLimit(): Promise<void> {
  const windowDuration = 60000;
  const defaultRequestsPerMinute = 6;
  const now = Date.now();

  // Start a new counting window once the previous one has fully elapsed.
  if (now - windowStart >= windowDuration) {
    requestCount = 0;
    windowStart = now;
  }

  const configuredLimit = parseInt(process.env.CF_RATE_LIMIT || "", 10);
  // `NaN >= 1` is false, so unparsable input also takes the default.
  const requestsPerMinute = configuredLimit >= 1 ? configuredLimit : defaultRequestsPerMinute;

  if (requestCount >= requestsPerMinute) {
    const waitTime = windowDuration - (now - windowStart);
    console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
    await new Promise((resolve) => setTimeout(resolve, waitTime));
    requestCount = 0;
    windowStart = Date.now();
  }

  // After a window reset `requestCount` is 0, so the spacing check below is
  // skipped and the `now` captured before sleeping is never used stale.
  const timeSinceLastRequest = now - lastRequestTime;
  if (timeSinceLastRequest < RATE_LIMIT_DELAY_MS && requestCount > 0) {
    const waitTime = RATE_LIMIT_DELAY_MS - timeSinceLastRequest;
    await new Promise((resolve) => setTimeout(resolve, waitTime));
  }

  lastRequestTime = Date.now();
  requestCount++;
}
|
||||
|
||||
/**
 * Runs `fn`, retrying only when it fails with a rate-limit error.
 *
 * An error counts as a rate-limit error when its `message` contains "429" or
 * "Rate limit". Before each retry the function sleeps for the number of
 * seconds given by a `Retry-After: <n>` fragment in the message if present,
 * otherwise for an exponential backoff of 1s, 2s, 4s, ... capped at 30s.
 *
 * @param fn - Operation to run; invoked again from scratch on every retry.
 * @param retries - Total number of attempts. Values below 1 (or NaN) are
 *   treated as 1, so `fn` always runs at least once; previously such values
 *   skipped `fn` entirely and threw `null`.
 * @returns Whatever `fn` resolves to.
 * @throws The error from `fn`, unchanged, when it is not a rate-limit error or
 *   the attempts are exhausted.
 */
async function fetchWithRetry<T>(
  fn: () => Promise<T>,
  retries: number = MAX_RETRIES
): Promise<T> {
  const maxAttempts = retries >= 1 ? Math.ceil(retries) : 1;

  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (error: unknown) {
      // Read `message` defensively: a thrown null/undefined/primitive must be
      // rethrown as-is rather than crash here with a TypeError.
      const errorStr =
        typeof error === "object" &&
        error !== null &&
        typeof (error as { message?: unknown }).message === "string"
          ? (error as { message: string }).message
          : "";

      const isRateLimit = errorStr.includes("429") || errorStr.includes("Rate limit");
      const isLastAttempt = attempt >= maxAttempts - 1;

      if (!isRateLimit || isLastAttempt) {
        throw error;
      }

      const retryAfterMatch = errorStr.match(/Retry-After[:\s]*(\d+)/i);
      const delay = retryAfterMatch
        ? parseInt(retryAfterMatch[1], 10) * 1000
        : Math.min(1000 * Math.pow(2, attempt), 30000);

      console.error(`Rate limited. Retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
|
||||
|
||||
/**
 * Starts a crawl job through the Browser Rendering REST API.
 *
 * Waits for `enforceRateLimit` once, then issues the POST through
 * `fetchWithRetry` so rate-limited responses are retried.
 *
 * @param accountId - Cloudflare account id placed in the request path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param options - Crawl settings; undefined fields get the defaults below.
 * @returns The id of the created crawl job.
 * @throws Error on a non-2xx response (the message carries the status, the
 *   body and any `Retry-After` header), when the body reports
 *   `success: false`, or when no job id can be found in the body. Previously a
 *   missing id was returned as `undefined` and later polled as
 *   `/crawl/undefined`.
 */
async function initiateCrawl(
  accountId: string,
  apiToken: string,
  options: CrawlOptions
): Promise<string> {
  await enforceRateLimit();

  return fetchWithRetry(async () => {
    const response = await fetch(
      `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`,
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apiToken}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          url: options.url,
          limit: options.limit ?? 10,
          depth: options.depth ?? 1,
          formats: options.formats ?? ["markdown"],
          render: options.render ?? true,
          // Left undefined when unset, so JSON.stringify omits the key.
          maxAge: options.maxAge,
          source: options.source ?? "all",
          options: options.options ?? {},
        }),
      }
    );

    if (!response.ok) {
      const error = await response.text();
      const retryAfter = response.headers.get("Retry-After");
      // The Retry-After value is embedded in the message because that is where
      // fetchWithRetry looks for it.
      const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
      throw new Error(errorMsg);
    }

    const data = await response.json();
    if (!data.success) {
      throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
    }

    // NOTE(review): a bare string `result` is accepted in addition to
    // `{ id }` - confirm the actual response shape against the API reference.
    const jobId = typeof data.result === "string" ? data.result : data.result?.id;
    if (typeof jobId !== "string" || jobId.length === 0) {
      throw new Error("Crawl initiation failed: response did not include a job id");
    }

    return jobId;
  });
}
|
||||
|
||||
/**
 * Polls a crawl job until its status is anything other than "running".
 *
 * @param accountId - Cloudflare account id placed in the request path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param jobId - Id of the job returned by `initiateCrawl`.
 * @param maxAttempts - Maximum number of status requests.
 * @param delayMs - Pause between two consecutive status requests.
 * @returns The `result` object of the first response whose status is not
 *   "running".
 * @throws Error on a non-2xx response, when a response carries no `result`,
 *   or when the job is still running after `maxAttempts` polls.
 *
 * NOTE(review): the status URL carries `?limit=1` and the polled `result` is
 * returned as-is; if that parameter caps the number of records returned, the
 * caller only ever sees one page - confirm against the API reference.
 */
async function waitForCrawl(
  accountId: string,
  apiToken: string,
  jobId: string,
  maxAttempts: number = 120,
  delayMs: number = 5000
): Promise<any> {
  const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;

  for (let i = 0; i < maxAttempts; i++) {
    const response = await fetch(statusUrl, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
    }

    const data = await response.json();
    const result = data?.result;
    if (!result) {
      // Fail with a readable message instead of a TypeError on `.status`.
      throw new Error("Failed to check crawl status: response did not include a result");
    }

    if (result.status !== "running") {
      return result;
    }

    // No point sleeping after the last poll: the timeout error comes next.
    if (i + 1 < maxAttempts) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  throw new Error("Crawl job did not complete within timeout");
}
|
||||
|
||||
/**
 * Flat argument set a crawl tool call supplies; `buildCrawlOptions` reshapes
 * it into `CrawlOptions`.
 */
interface CrawlArgs {
  /** Starting URL of the crawl. */
  url: string;
  /** Maximum number of pages to crawl. */
  limit?: number;
  /** Maximum link depth from the starting URL. */
  depth?: number;
  /** Follow links that lead to subdomains of the starting URL. */
  includeSubdomains?: boolean;
  /** Follow links that lead to external domains. */
  includeExternalLinks?: boolean;
  /** Wildcard patterns; only matching URLs are visited. */
  includePatterns?: string[];
  /** Wildcard patterns; matching URLs are not visited. */
  excludePatterns?: string[];
  /** When false, pages are fetched without executing JavaScript. */
  render?: boolean;
}
|
||||
|
||||
/**
 * Reshapes flat tool arguments into the nested `CrawlOptions` structure.
 *
 * The four link-following settings are grouped under `options`; everything is
 * copied as-is, so fields absent from `args` stay `undefined`.
 *
 * @param args - Flat arguments from the tool call.
 * @param formats - Output formats to request.
 * @returns A new `CrawlOptions` object.
 */
function buildCrawlOptions(args: CrawlArgs, formats: string[]): CrawlOptions {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;

  const options = {
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  };

  return { url, limit, depth, formats, render, options };
}
|
||||
|
||||
/**
 * Renders the completed pages of a crawl result as one Markdown document.
 *
 * Each completed record becomes a section with a `##` heading (the page title,
 * or the URL when no title is present), its URL, its Markdown body and a
 * `---` separator. A summary line with the completed/total counts comes first.
 *
 * @param result - Crawl result with optional `records` and a `total` count.
 * @returns The summary line followed by the page sections.
 */
function formatMarkdownResult(result: any): string {
  const sections: string[] = [];

  for (const record of result.records || []) {
    if (record.status !== "completed") {
      continue;
    }

    const heading = record.metadata?.title || record.url;
    const body = record.markdown || "";
    sections.push(`## ${heading}\n\nURL: ${record.url}\n\n${body}\n\n---\n`);
  }

  const summary = `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.`;
  return `${summary}\n\n${sections.join("\n")}`;
}
|
||||
|
||||
/** Escapes the characters that are significant in HTML text and attribute values. */
function escapeHtml(value: unknown): string {
  const replacements: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return String(value).replace(/[&<>"']/g, (ch) => replacements[ch]);
}

/**
 * Renders the completed pages of a crawl result as a sequence of `<article>`
 * elements, preceded by a summary line with the completed/total counts.
 *
 * The page title (or URL when no title is present) and the URL come from
 * crawled, untrusted pages, so they are HTML-escaped before being placed in
 * the heading, the link text and the `href` attribute. The page's own `html`
 * is the requested payload and is embedded unchanged.
 *
 * @param result - Crawl result with optional `records` and a `total` count.
 * @returns The summary line followed by one `<article>` per completed page.
 */
function formatHtmlResult(result: any): string {
  const records = result.records || [];
  const completedRecords = records.filter((r: any) => r.status === "completed");

  const content = completedRecords
    .map((record: any) => {
      const title = escapeHtml(record.metadata?.title || record.url);
      const url = escapeHtml(record.url);
      return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${url}">${url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
    })
    .join("\n");

  return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
|
||||
|
||||
/**
 * Serialises the completed pages of a crawl result as pretty-printed JSON.
 *
 * The document has a `summary` (total, number of completed records, job
 * status) and a `pages` array with one entry per completed record.
 *
 * @param result - Crawl result with optional `records`, `total` and `status`.
 * @returns JSON text indented by two spaces.
 */
function formatJsonResult(result: any): string {
  const pages: any[] = [];

  for (const record of result.records || []) {
    if (record.status !== "completed") {
      continue;
    }

    pages.push({
      url: record.url,
      title: record.metadata?.title,
      status: record.metadata?.status,
      markdown: record.markdown,
      html: record.html,
      json: record.json,
    });
  }

  const summary = {
    total: result.total,
    completed: pages.length,
    status: result.status,
  };

  return JSON.stringify({ summary, pages }, null, 2);
}
|
||||
|
||||
/**
 * Builds the error response for a crawl job that ended without completing.
 *
 * Known statuses get a dedicated message; any other status is reported
 * verbatim. A `switch` is used rather than a plain-object lookup because the
 * latter also resolves inherited keys: a status such as "constructor" used to
 * yield a function instead of a message string.
 *
 * @param result - Job result; only `status` is read.
 * @param jobId - Id of the job, included in every message.
 * @returns A single text content item with `isError` set to true.
 */
function handleErrorResult(result: any, jobId: string): { content: any[]; isError: boolean } {
  let message: string;

  switch (result.status) {
    case "errored":
      message = `Crawl job errored. Job ID: ${jobId}`;
      break;
    case "cancelled_due_to_timeout":
      message = `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`;
      break;
    case "cancelled_due_to_limits":
      message = `Crawl job cancelled due to account limits. Job ID: ${jobId}`;
      break;
    case "cancelled_by_user":
      message = `Crawl job was cancelled by user. Job ID: ${jobId}`;
      break;
    default:
      message = `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
  }

  return {
    content: [{ type: "text" as const, text: message }],
    isError: true,
  };
}
|
||||
|
||||
/**
 * The MCP server instance. It identifies itself by name and version and
 * declares the `tools` capability; the request handlers are registered on it
 * further down in this module.
 */
const server = new Server(
  { name: "cloudflare-crawl-mcp", version: "1.0.0" },
  { capabilities: { tools: {} } }
);
|
||||
|
||||
/**
 * JSON Schema for the input of every crawl tool. Only `url` is required; the
 * other properties tune how far and where the crawl goes.
 */
const baseToolSchema = {
  type: "object" as const,
  properties: {
    // Where to start.
    url: { type: "string" as const, description: "The starting URL to crawl" },

    // How far to go.
    limit: {
      type: "number" as const,
      description: "Maximum number of pages to crawl (default: 10, max: 100000)",
    },
    depth: {
      type: "number" as const,
      description: "Maximum link depth to crawl from the starting URL (default: 1)",
    },

    // Which links to follow.
    includeSubdomains: {
      type: "boolean" as const,
      description: "If true, follows links to subdomains of the starting URL (default: false)",
    },
    includeExternalLinks: {
      type: "boolean" as const,
      description: "If true, follows links to external domains (default: false)",
    },
    includePatterns: {
      type: "array" as const,
      items: { type: "string" as const },
      description: "Only visits URLs that match one of these wildcard patterns",
    },
    excludePatterns: {
      type: "array" as const,
      items: { type: "string" as const },
      description: "Does not visit URLs that match any of these wildcard patterns",
    },

    // How to fetch each page.
    render: {
      type: "boolean" as const,
      description: "If false, does a fast HTML fetch without executing JavaScript (default: true)",
    },
  },
  required: ["url"] as string[],
};
|
||||
|
||||
/**
 * Markdown footer appended to every tool description: a table of plan limits,
 * the rate-limit environment variable, and usage tips. The text starts with a
 * newline so it can be concatenated directly after a sentence.
 */
const RATE_LIMIT_INFO = [
  "",
  "---",
  "**Cloudflare Browser Rendering Limits:**",
  "",
  "| Plan | Concurrent Browsers | Browser Time | REST API Rate |",
  "|------|---------------------|--------------|---------------|",
  "| Free | 3 | 10 min/day | 6 req/min |",
  "| Paid | 10 | 10 hours/month | 600 req/min |",
  "",
  "**Environment Variables:**",
  "- CF_RATE_LIMIT: Override REST API requests per minute (default: 6 for Free, 600 for Paid)",
  "",
  "**Tips:**",
  "- Use `render: false` for static content to avoid browser time usage",
  "- Use `maxAge` to cache results and reduce API calls",
  "- Set `limit` and `depth` appropriately to stay within limits",
  "---",
].join("\n");
|
||||
|
||||
// Advertises the three crawl tools. They share one input schema and differ
// only in name and in the output format named in the description.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  const intro = "Crawl a website using Cloudflare Browser Rendering and return content in ";
  const outro =
    " Supports following links across the site up to a configurable depth or page limit." +
    RATE_LIMIT_INFO;

  const markdownTool = {
    name: "crawl_url_markdown",
    description: `${intro}Markdown format.${outro}`,
    // Shallow copies of the shared schema and of its property map.
    inputSchema: {
      ...baseToolSchema,
      properties: {
        ...baseToolSchema.properties,
      },
    },
  };

  const htmlTool = {
    name: "crawl_url_html",
    description: `${intro}HTML format.${outro}`,
    inputSchema: baseToolSchema,
  };

  const jsonTool = {
    name: "crawl_url_json",
    description: `${intro}JSON format. This uses Workers AI for data extraction.${outro}`,
    inputSchema: baseToolSchema,
  };

  return { tools: [markdownTool, htmlTool, jsonTool] };
});
|
||||
|
||||
// Executes a crawl tool call: validates the request, starts the crawl, waits
// for it to finish and renders the result in the format implied by the tool
// name. Every failure is reported as a text item with `isError: true` rather
// than thrown.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: rawArgs } = request.params as {
    name: string;
    arguments?: Record<string, unknown>;
  };

  const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
  if (!toolMatch) {
    return {
      content: [{ type: "text", text: `Unknown tool: ${name}` }],
      isError: true,
    };
  }

  // A call without arguments, or without a usable url, used to surface as an
  // opaque TypeError or reach the API with an invalid body; reject it up front.
  const args = rawArgs ?? {};
  const url = args.url;
  if (typeof url !== "string" || url.trim() === "") {
    return {
      content: [{ type: "text", text: 'Error: "url" is required and must be a non-empty string' }],
      isError: true,
    };
  }

  const format = toolMatch[1] as "markdown" | "html" | "json";
  // Each tool requests exactly the one format it is named after.
  const formats: string[] = [format];

  try {
    const apiToken = getEnv("CF_API_TOKEN");
    const accountId = getEnv("CF_ACCOUNT_ID");

    const crawlArgs: CrawlArgs = {
      url,
      limit: args.limit as number | undefined,
      depth: args.depth as number | undefined,
      includeSubdomains: args.includeSubdomains as boolean | undefined,
      includeExternalLinks: args.includeExternalLinks as boolean | undefined,
      includePatterns: args.includePatterns as string[] | undefined,
      excludePatterns: args.excludePatterns as string[] | undefined,
      render: args.render as boolean | undefined,
    };
    const options = buildCrawlOptions(crawlArgs, formats);

    const jobId = await initiateCrawl(accountId, apiToken, options);
    const result = await waitForCrawl(accountId, apiToken, jobId);

    const terminalStatuses = ["errored", "cancelled_due_to_timeout", "cancelled_due_to_limits", "cancelled_by_user"];
    if (terminalStatuses.includes(result.status)) {
      return handleErrorResult(result, jobId);
    }

    const formatterMap: Record<"markdown" | "html" | "json", (result: any) => string> = {
      markdown: formatMarkdownResult,
      html: formatHtmlResult,
      json: formatJsonResult,
    };

    const formattedContent = formatterMap[format](result);

    return {
      content: [{ type: "text", text: formattedContent }],
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      content: [{ type: "text", text: `Error: ${message}` }],
      isError: true,
    };
  }
});
|
||||
|
||||
/** Connects the server to a stdio transport. */
async function main() {
  await server.connect(new StdioServerTransport());
}

// A startup failure is logged and the process exits with status 1.
main().catch((error) => {
  console.error("Server error:", error);
  process.exit(1);
});
|
||||
@@ -0,0 +1,193 @@
|
||||
import { describe, it, expect, beforeAll } from 'vitest';
|
||||
|
||||
/** Root of the Cloudflare v4 REST API; account-scoped paths are appended to it. */
const API_BASE = "https://api.cloudflare.com/client/v4";
|
||||
|
||||
/**
 * Request settings accepted by this file's `initiateCrawl`. Only `url` is
 * mandatory; undefined fields are replaced by the defaults noted below.
 */
interface CrawlOptions {
  /** Starting URL of the crawl. */
  url: string;
  /** Maximum number of pages to crawl (sent as 10 when omitted). */
  limit?: number;
  /** Maximum link depth from the starting URL (sent as 1 when omitted). */
  depth?: number;
  /** Output formats to request (sent as `["markdown"]` when omitted). */
  formats?: string[];
  /** Whether pages are rendered with JavaScript (sent as `true` when omitted). */
  render?: boolean;
  /** Link-following rules (sent as `{}` when omitted). */
  options?: {
    includeExternalLinks?: boolean;
    includeSubdomains?: boolean;
    includePatterns?: string[];
    excludePatterns?: string[];
  };
}
|
||||
|
||||
/**
 * Starts a crawl job with a single POST (no rate limiting, no retries).
 *
 * @param accountId - Cloudflare account id placed in the request path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param options - Crawl settings; undefined fields get defaults.
 * @returns `result.id` from the response body.
 * @throws Error on a non-2xx response or when the body reports `success: false`.
 */
async function initiateCrawl(
  accountId: string,
  apiToken: string,
  options: CrawlOptions
): Promise<string> {
  const endpoint = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`;

  const payload = {
    url: options.url,
    limit: options.limit ?? 10,
    depth: options.depth ?? 1,
    formats: options.formats ?? ["markdown"],
    render: options.render ?? true,
    options: options.options ?? {},
  };

  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiToken}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const detail = await response.text();
    throw new Error(`Failed to initiate crawl: ${response.status} ${detail}`);
  }

  const data = await response.json();
  if (data.success) {
    return data.result.id;
  }

  throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
}
|
||||
|
||||
/**
 * Polls a crawl job until its status is anything other than "running".
 *
 * @param accountId - Cloudflare account id placed in the request path.
 * @param apiToken - API token sent as a Bearer credential.
 * @param jobId - Id of the job to poll.
 * @param maxAttempts - Maximum number of status requests.
 * @param delayMs - Pause after each poll that still reports "running".
 * @returns The `result` object of the first non-running response.
 * @throws Error on a non-2xx response or when every poll reported "running".
 */
async function waitForCrawl(
  accountId: string,
  apiToken: string,
  jobId: string,
  maxAttempts: number = 60,
  delayMs: number = 5000
): Promise<any> {
  const pause = () => new Promise((resolve) => setTimeout(resolve, delayMs));

  let polls = 0;
  while (polls < maxAttempts) {
    polls += 1;

    const response = await fetch(
      `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`,
      { headers: { Authorization: `Bearer ${apiToken}` } }
    );

    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${detail}`);
    }

    const { result } = await response.json();
    if (result.status !== "running") {
      return result;
    }

    await pause();
  }

  throw new Error("Crawl job did not complete within timeout");
}
|
||||
|
||||
/**
 * Reads a mandatory environment variable.
 *
 * @param key - Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or set to an empty string.
 */
function getEnv(key: string): string {
  const configured = process.env[key];
  if (configured) {
    return configured;
  }

  throw new Error(`Missing required environment variable: ${key}`);
}
|
||||
|
||||
// Live test against the real API. It is skipped unless both CF_API_TOKEN and
// CF_ACCOUNT_ID are set, and it passes silently when a rate-limit error occurs.
describe('Integration: Cloudflare Crawl API', () => {
  const apiToken = process.env.CF_API_TOKEN;
  const accountId = process.env.CF_ACCOUNT_ID;

  const hasCredentials = Boolean(apiToken && accountId);

  beforeAll(() => {
    if (!hasCredentials) {
      console.log('\n⚠️ Skipping integration tests - CF_API_TOKEN or CF_ACCOUNT_ID not set\n');
    }
  });

  it.skipIf(!hasCredentials)('should crawl raczylo.com with multiple pages in markdown format', async () => {
    const accountId = getEnv("CF_ACCOUNT_ID");
    const apiToken = getEnv("CF_API_TOKEN");

    try {
      const jobId = await initiateCrawl(accountId, apiToken, {
        url: "https://raczylo.com",
        limit: 5,
        depth: 2,
        formats: ["markdown"],
      });

      console.log(` Started crawl job: ${jobId}`);

      expect(jobId).toBeDefined();
      expect(typeof jobId).toBe("string");

      const result = await waitForCrawl(accountId, apiToken, jobId, 60, 5000);

      console.log(` Crawl status: ${result.status}`);
      console.log(` Total pages discovered: ${result.total}`);
      console.log(` Pages finished: ${result.finished}`);

      expect(result.status).toBe("completed");
      expect(result.total).toBeGreaterThan(0);
      expect(result.records).toBeDefined();
      expect(Array.isArray(result.records)).toBe(true);
      expect(result.records.length).toBeGreaterThan(0);

      const completedRecords = result.records.filter((r: any) => r.status === "completed");
      console.log(` Completed pages: ${completedRecords.length}`);

      // Without at least one completed page the per-page checks below would
      // pass vacuously.
      expect(completedRecords.length).toBeGreaterThan(0);

      completedRecords.forEach((record: any, index: number) => {
        expect(record.url).toBeDefined();
        expect(record.markdown).toBeDefined();
        expect(record.markdown.length).toBeGreaterThan(0);
        console.log(` Page ${index + 1}: ${record.url} (${record.markdown.length} chars)`);
      });

      // Inspect the first *completed* record: records[0] may be a skipped or
      // errored entry that carries no markdown at all.
      const firstRecord = completedRecords[0];
      expect(firstRecord.markdown).toContain("#");
    } catch (error: unknown) {
      if (error instanceof Error && error.message.includes("Rate limit")) {
        console.log(" ⚠️ Skipped - Rate limit exceeded");
        return;
      }
      throw error;
    }
  }, 360000);
});
|
||||
|
||||
/**
 * Unit tests for `getEnv`: it must throw a descriptive error when a required
 * variable is missing and return the value when it is set.
 *
 * Every test restores the variable it touches, so real credentials in the
 * environment survive for anything that runs later in the same process.
 */
describe('Environment Variable Validation', () => {
  const testCases = [
    {
      name: 'CF_API_TOKEN is required',
      envKey: 'CF_API_TOKEN',
      expectedError: 'Missing required environment variable: CF_API_TOKEN',
    },
    {
      name: 'CF_ACCOUNT_ID is required',
      envKey: 'CF_ACCOUNT_ID',
      expectedError: 'Missing required environment variable: CF_ACCOUNT_ID',
    },
  ];

  /**
   * Runs `body`, then puts `process.env[key]` back to whatever it was before,
   * even when an assertion inside `body` throws.
   */
  const withRestoredEnv = (key: string, body: () => void): void => {
    const previous = process.env[key];
    try {
      body();
    } finally {
      if (previous === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = previous;
      }
    }
  };

  it.each(testCases)('$name', ({ envKey, expectedError }) => {
    withRestoredEnv(envKey, () => {
      delete process.env[envKey];
      expect(() => getEnv(envKey)).toThrow(expectedError);
    });
  });

  it('should return value when CF_API_TOKEN is set', () => {
    withRestoredEnv('CF_API_TOKEN', () => {
      process.env.CF_API_TOKEN = 'test-token';
      expect(getEnv('CF_API_TOKEN')).toBe('test-token');
    });
  });

  it('should return value when CF_ACCOUNT_ID is set', () => {
    withRestoredEnv('CF_ACCOUNT_ID', () => {
      process.env.CF_ACCOUNT_ID = 'test-account';
      expect(getEnv('CF_ACCOUNT_ID')).toBe('test-account');
    });
  });
});
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "bundler",
|
||||
"esModuleInterop": true,
|
||||
"strict": true,
|
||||
"outDir": "dist",
|
||||
"rootDir": "src",
|
||||
"declaration": true,
|
||||
"skipLibCheck": true
|
||||
},
|
||||
"include": ["src/**/*"]
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
import { defineConfig } from 'vitest/config';
|
||||
|
||||
// Vitest configuration: collect test files from src/ and ignore anything
// under dist/.
const testFileGlobs = ['src/**/*.test.ts'];
const ignoredGlobs = ['dist/**'];

export default defineConfig({
  test: { include: testFileGlobs, exclude: ignoredGlobs },
});
|
||||
Reference in New Issue
Block a user