Files
cloudflare-crawl-mcp/dist/index.test.js
T
2026-03-11 19:24:29 +00:00

629 lines
22 KiB
JavaScript

import { describe, it, expect, beforeEach, vi } from 'vitest';
// Base URL of the Cloudflare client API (v4); the request paths built in this file are appended to it.
const API_BASE = "https://api.cloudflare.com/client/v4";
/**
 * Look up a required environment variable.
 *
 * @param {string} key - Name of the variable to read from `process.env`.
 * @returns {string} The variable's value.
 * @throws {Error} When the variable is unset or is an empty string.
 */
function getEnv(key) {
  const { [key]: found } = process.env;
  if (found) {
    return found;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
/**
 * Start a crawl job by POSTing to the account's `browser-rendering/crawl` endpoint.
 *
 * Fields missing (null/undefined) from `options` fall back to: limit 10, depth 1,
 * formats ["markdown"], render true, source "all", options {}. `maxAge` is passed
 * through as given.
 *
 * @param {string} accountId - Account identifier placed in the request path.
 * @param {string} apiToken - Token sent as a Bearer `Authorization` header.
 * @param {object} options - Crawl settings; `url` is sent as-is.
 * @returns {Promise<*>} The `result.id` field of the response body.
 * @throws {Error} When the HTTP response is not ok, or the body's `success` flag is falsy.
 */
async function initiateCrawl(accountId, apiToken, options) {
  const endpoint = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl`;
  const headers = {
    Authorization: `Bearer ${apiToken}`,
    "Content-Type": "application/json",
  };
  // Key order is kept stable so the serialized request body does not change.
  const payload = {
    url: options.url,
    limit: options.limit ?? 10,
    depth: options.depth ?? 1,
    formats: options.formats ?? ["markdown"],
    render: options.render ?? true,
    maxAge: options.maxAge,
    source: options.source ?? "all",
    options: options.options ?? {},
  };

  const reply = await fetch(endpoint, {
    method: "POST",
    headers,
    body: JSON.stringify(payload),
  });
  if (!reply.ok) {
    const detail = await reply.text();
    throw new Error(`Failed to initiate crawl: ${reply.status} ${detail}`);
  }

  const parsed = await reply.json();
  if (parsed.success) {
    return parsed.result.id;
  }
  throw new Error(`Crawl initiation failed: ${JSON.stringify(parsed.errors)}`);
}
/**
 * Poll a crawl job until its status is something other than "running".
 *
 * @param {string} accountId - Account identifier placed in the request path.
 * @param {string} apiToken - Token sent as a Bearer `Authorization` header.
 * @param {string} jobId - Identifier of the crawl job to poll.
 * @param {number} [maxAttempts=120] - Maximum number of status requests.
 * @param {number} [delayMs=5000] - Pause between consecutive status requests.
 * @returns {Promise<object>} The response's `result` object once the job is no longer running.
 * @throws {Error} When a status request is not ok, the response carries no `result`,
 *   or the job is still running after `maxAttempts` requests.
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
  const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const response = await fetch(statusUrl, {
      headers: {
        Authorization: `Bearer ${apiToken}`,
      },
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
    }
    const data = await response.json();
    // Without a `result` there is no status to read; report what the API sent
    // instead of failing with a TypeError on `data.result.status`.
    if (!data?.result) {
      throw new Error(`Failed to check crawl status: ${JSON.stringify(data?.errors ?? [])}`);
    }
    if (data.result.status !== "running") {
      return data.result;
    }
    // Only pause when another attempt follows; sleeping after the final
    // attempt would just postpone the timeout error.
    if (attempt < maxAttempts - 1) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
  throw new Error("Crawl job did not complete within timeout");
}
/**
 * Reshape flat tool arguments into the nested crawl-options object.
 *
 * @param {object} args - Flat arguments; unset fields are carried over as `undefined`.
 * @param {*} formats - Stored unchanged under the `formats` key.
 * @returns {object} `{ url, limit, depth, formats, render, options }`, where `options`
 *   holds the include/exclude settings.
 */
function buildCrawlOptions(args, formats) {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;
  const options = {
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  };
  return { url, limit, depth, formats, render, options };
}
/**
 * Render the completed records of a crawl result as one Markdown document.
 *
 * Records whose `status` is not "completed" are left out. Each remaining record
 * becomes a section headed by its metadata title, or its URL when no title is set.
 *
 * @param {object} result - Crawl result; reads `records` (optional) and `total`.
 * @returns {string} A summary line followed by the rendered sections.
 */
function formatMarkdownResult(result) {
  const sections = (result.records || []).flatMap((record) => {
    if (record.status !== "completed") {
      return [];
    }
    const heading = record.metadata?.title || record.url;
    return [`## ${heading}\n\nURL: ${record.url}\n\n${record.markdown || ""}\n\n---\n`];
  });
  const summary = `Crawl completed: ${sections.length} of ${result.total} pages crawled successfully.`;
  return `${summary}\n\n${sections.join("\n")}`;
}
/**
 * Render the completed records of a crawl result as a sequence of `<article>` elements.
 *
 * Records whose `status` is not "completed" are left out. The title (metadata title,
 * or the URL when no title is set) and the URL are HTML-escaped before being placed in
 * the wrapper markup; `record.html` is inserted unchanged.
 *
 * @param {object} result - Crawl result; reads `records` (optional) and `total`.
 * @returns {string} A plain-text summary line followed by the rendered articles.
 */
function formatHtmlResult(result) {
  const htmlEscapes = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // Titles and URLs are interpolated into element text and an href attribute, so
  // markup-significant characters must not reach the output verbatim.
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);

  const records = result.records || [];
  const completedRecords = records.filter((r) => r.status === "completed");
  const content = completedRecords
    .map((record) => {
      const title = escapeHtml(record.metadata?.title || record.url);
      const url = escapeHtml(record.url);
      return `<article>\n <h2>${title}</h2>\n <p>Source: <a href="${url}">${url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`;
    })
    .join("\n");
  return `Crawl completed: ${completedRecords.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
/**
 * Serialize a crawl result as pretty-printed JSON (2-space indent).
 *
 * The output has a `summary` (total, completed count, job status) and a `pages`
 * array with one entry per record whose `status` is "completed".
 *
 * @param {object} result - Crawl result; reads `records` (optional), `total` and `status`.
 * @returns {string} The JSON text.
 */
function formatJsonResult(result) {
  // Per-page `status` is taken from the record's metadata, not from the record itself.
  const toPage = ({ url, metadata, markdown, html, json }) => ({
    url,
    title: metadata?.title,
    status: metadata?.status,
    markdown,
    html,
    json,
  });
  const finished = (result.records || []).filter(({ status }) => status === "completed");
  const summary = {
    total: result.total,
    completed: finished.length,
    status: result.status,
  };
  return JSON.stringify({ summary, pages: finished.map(toPage) }, null, 2);
}
/**
 * Build an error response describing why a crawl job did not complete.
 *
 * @param {object} result - Job result; only `status` is read.
 * @param {string} jobId - Job identifier included in the message.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 *   A single text entry with the message, and `isError` set to true.
 */
function handleErrorResult(result, jobId) {
  const errorMessages = {
    errored: `Crawl job errored. Job ID: ${jobId}`,
    cancelled_due_to_timeout: `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`,
    cancelled_due_to_limits: `Crawl job cancelled due to account limits. Job ID: ${jobId}`,
    cancelled_by_user: `Crawl job was cancelled by user. Job ID: ${jobId}`,
  };
  // Own-property check: the status is not validated here, and a bare lookup would
  // resolve inherited members such as "constructor" to a non-string value.
  const isKnownStatus = Object.prototype.hasOwnProperty.call(errorMessages, result.status);
  const message = isKnownStatus
    ? errorMessages[result.status]
    : `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
  return {
    content: [{ type: "text", text: message }],
    isError: true,
  };
}
// Table-driven checks for getEnv: a set variable is returned, while an empty or
// unset variable raises the "Missing required environment variable" error.
describe('getEnv', () => {
  const testCases = [
    {
      name: 'returns value when env var exists',
      envKey: 'TEST_VAR',
      envValue: 'test-value',
      expected: 'test-value',
    },
    {
      name: 'throws when env var is empty string',
      envKey: 'EMPTY_VAR',
      envValue: '',
      expectedError: 'Missing required environment variable: EMPTY_VAR',
    },
    {
      name: 'throws when env var is undefined',
      envKey: 'UNDEFINED_VAR',
      envValue: undefined,
      expectedError: 'Missing required environment variable: UNDEFINED_VAR',
    },
  ];
  it.each(testCases)('$name', ({ envKey, envValue, expected, expectedError }) => {
    // Remember the pre-test value so each case leaves process.env as it found it.
    const previousValue = process.env[envKey];
    try {
      if (envValue === undefined) {
        delete process.env[envKey];
      } else {
        process.env[envKey] = envValue;
      }
      if (expectedError) {
        expect(() => getEnv(envKey)).toThrow(expectedError);
      } else {
        expect(getEnv(envKey)).toBe(expected);
      }
    } finally {
      if (previousValue === undefined) {
        delete process.env[envKey];
      } else {
        process.env[envKey] = previousValue;
      }
    }
  });
});
// Table-driven checks that buildCrawlOptions maps flat arguments onto the nested
// options object, compared with toEqual.
describe('buildCrawlOptions', () => {
  // Nested `options` value expected when no include/exclude argument is given.
  const unsetFilters = () => ({
    includeExternalLinks: undefined,
    includeSubdomains: undefined,
    includePatterns: undefined,
    excludePatterns: undefined,
  });
  const scenarios = [
    {
      name: 'builds options with markdown format',
      args: { url: 'https://example.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://example.com',
        limit: undefined,
        depth: undefined,
        formats: ['markdown'],
        render: undefined,
        options: unsetFilters(),
      },
    },
    {
      name: 'builds options with all parameters',
      args: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        includeSubdomains: true,
        includeExternalLinks: false,
        includePatterns: ['**/docs/**'],
        excludePatterns: ['**/archive/**'],
        render: true,
      },
      formats: ['html'],
      expected: {
        url: 'https://example.com',
        limit: 50,
        depth: 2,
        formats: ['html'],
        render: true,
        options: {
          includeExternalLinks: false,
          includeSubdomains: true,
          includePatterns: ['**/docs/**'],
          excludePatterns: ['**/archive/**'],
        },
      },
    },
    {
      name: 'builds options with json format',
      args: { url: 'https://api.example.com', limit: 100 },
      formats: ['json'],
      expected: {
        url: 'https://api.example.com',
        limit: 100,
        formats: ['json'],
        depth: undefined,
        render: undefined,
        options: unsetFilters(),
      },
    },
    {
      name: 'handles empty options object',
      args: { url: 'https://test.com' },
      formats: ['markdown'],
      expected: {
        url: 'https://test.com',
        formats: ['markdown'],
        options: unsetFilters(),
      },
    },
  ];
  it.each(scenarios)('$name', ({ args, formats, expected }) => {
    expect(buildCrawlOptions(args, formats)).toEqual(expected);
  });
});
// Table-driven checks that formatMarkdownResult output contains the expected
// headings, bodies and summary line.
describe('formatMarkdownResult', () => {
  // Wrap records in a crawl result whose `total` equals the number of records.
  const crawlOf = (...records) => ({ total: records.length, status: 'completed', records });
  const scenarios = [
    {
      name: 'formats single completed page',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        markdown: '# Hello World',
        metadata: { title: 'Home Page', status: 200 },
      }),
      expectedContains: ['## Home Page', '# Hello World', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: crawlOf(
        {
          url: 'https://example.com',
          status: 'completed',
          markdown: '# Page 1',
          metadata: { title: 'Page One', status: 200 },
        },
        {
          url: 'https://example.com/about',
          status: 'completed',
          markdown: '# About Us',
          metadata: { title: 'About', status: 200 },
        },
      ),
      expectedContains: ['## Page One', '## About', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing markdown content',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        markdown: '',
        metadata: { title: 'Test', status: 200 },
      }),
      expectedContains: ['## Test', 'URL: https://example.com'],
    },
    {
      name: 'uses url as title when metadata.title is missing',
      result: crawlOf({
        url: 'https://example.com/unnamed',
        status: 'completed',
        markdown: 'Content here',
      }),
      expectedContains: ['## https://example.com/unnamed', 'Content here'],
    },
    {
      name: 'handles empty records array',
      result: crawlOf(),
      expectedContains: ['Crawl completed: 0 of 0'],
    },
    {
      name: 'filters out non-completed records',
      result: crawlOf(
        { url: 'https://example.com/1', status: 'completed', markdown: '# Done' },
        { url: 'https://example.com/2', status: 'errored', markdown: '# Failed' },
        { url: 'https://example.com/3', status: 'skipped' },
      ),
      expectedContains: ['Crawl completed: 1 of 3', '# Done'],
    },
  ];
  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatMarkdownResult(result);
    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
// Table-driven checks that formatHtmlResult output contains the expected
// headings, content and summary line.
describe('formatHtmlResult', () => {
  // Wrap records in a crawl result whose `total` equals the number of records.
  const crawlOf = (...records) => ({ total: records.length, status: 'completed', records });
  const scenarios = [
    {
      name: 'formats single completed page with HTML',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        html: '<p>Hello World</p>',
        metadata: { title: 'Home Page', status: 200 },
      }),
      expectedContains: ['<h2>Home Page</h2>', '<p>Hello World</p>', 'Crawl completed: 1 of 1'],
    },
    {
      name: 'formats multiple completed pages',
      result: crawlOf(
        {
          url: 'https://example.com',
          status: 'completed',
          html: '<div>Page 1</div>',
          metadata: { title: 'Page One', status: 200 },
        },
        {
          url: 'https://example.com/about',
          status: 'completed',
          html: '<div>About Us</div>',
          metadata: { title: 'About', status: 200 },
        },
      ),
      expectedContains: ['<h2>Page One</h2>', '<h2>About</h2>', 'Crawl completed: 2 of 2'],
    },
    {
      name: 'handles missing HTML content',
      result: crawlOf({
        url: 'https://example.com',
        status: 'completed',
        html: '',
        metadata: { title: 'Test', status: 200 },
      }),
      expectedContains: ['<h2>Test</h2>', '<a href="https://example.com">'],
    },
  ];
  it.each(scenarios)('$name', ({ result, expectedContains }) => {
    const rendered = formatHtmlResult(result);
    for (const fragment of expectedContains) {
      expect(rendered).toContain(fragment);
    }
  });
});
// Table-driven checks for formatJsonResult: the output must parse as JSON, carry a
// summary matching the input, and list exactly the completed records as pages.
describe('formatJsonResult', () => {
  const testCases = [
    {
      name: 'formats single completed page as JSON',
      result: {
        total: 1,
        status: 'completed',
        records: [
          {
            url: 'https://example.com',
            status: 'completed',
            markdown: '# Hello',
            html: '<h1>Hello</h1>',
            json: { key: 'value' },
            metadata: { title: 'Home', status: 200 },
          },
        ],
      },
    },
    {
      name: 'formats multiple completed pages as JSON',
      result: {
        total: 2,
        status: 'completed',
        records: [
          {
            url: 'https://example.com/page1',
            status: 'completed',
            markdown: '# Page 1',
          },
          {
            url: 'https://example.com/page2',
            status: 'completed',
            markdown: '# Page 2',
          },
        ],
      },
    },
    {
      name: 'includes summary with correct counts',
      result: {
        total: 5,
        status: 'completed',
        records: [
          { url: 'https://example.com/1', status: 'completed' },
          { url: 'https://example.com/2', status: 'completed' },
          { url: 'https://example.com/3', status: 'errored' },
          { url: 'https://example.com/4', status: 'skipped' },
          { url: 'https://example.com/5', status: 'completed' },
        ],
      },
    },
    {
      name: 'handles empty records',
      result: {
        total: 0,
        status: 'completed',
        records: [],
      },
    },
  ];
  it.each(testCases)('$name', ({ result }) => {
    const output = formatJsonResult(result);
    const parsed = JSON.parse(output);
    expect(parsed).toHaveProperty('summary');
    expect(parsed).toHaveProperty('pages');
    const completedRecords = result.records.filter((r) => r.status === 'completed');
    expect(parsed.summary.completed).toBe(completedRecords.length);
    expect(parsed.summary.total).toBe(result.total);
    expect(parsed.summary.status).toBe(result.status);
    // The pages themselves must mirror the completed records, in order. Fields
    // absent from a record are dropped by JSON serialization and read back as undefined.
    expect(parsed.pages).toHaveLength(completedRecords.length);
    parsed.pages.forEach((page, index) => {
      const record = completedRecords[index];
      expect(page.url).toBe(record.url);
      expect(page.title).toBe(record.metadata?.title);
      expect(page.status).toBe(record.metadata?.status);
      expect(page.markdown).toBe(record.markdown);
      expect(page.html).toBe(record.html);
      expect(page.json).toEqual(record.json);
    });
  });
});
// Table-driven checks that handleErrorResult flags the response as an error and
// mentions both the failure reason and the job ID.
describe('handleErrorResult', () => {
  // Every scenario expects `isError` to be true; only the status, job ID and
  // expected message fragments vary.
  const scenario = (name, status, jobId, expectedContains) => ({
    name,
    result: { status },
    jobId,
    expectedError: true,
    expectedContains,
  });
  const scenarios = [
    scenario('handles errored status', 'errored', 'test-job-123', ['errored', 'test-job-123']),
    scenario('handles cancelled_due_to_timeout status', 'cancelled_due_to_timeout', 'job-456', ['timeout', 'job-456']),
    scenario('handles cancelled_due_to_limits status', 'cancelled_due_to_limits', 'job-789', ['limits', 'job-789']),
    scenario('handles cancelled_by_user status', 'cancelled_by_user', 'job-000', ['cancelled by user', 'job-000']),
    scenario('handles unknown status', 'some_unknown_status', 'job-unknown', ['some_unknown_status', 'job-unknown']),
  ];
  it.each(scenarios)('$name', ({ result, jobId, expectedError, expectedContains }) => {
    const response = handleErrorResult(result, jobId);
    expect(response.isError).toBe(expectedError);
    for (const fragment of expectedContains) {
      expect(response.content[0].text).toContain(fragment);
    }
  });
});
// Checks initiateCrawl against a stubbed global fetch: the success path returns the
// job ID, while HTTP errors and unsuccessful API replies reject.
describe('initiateCrawl', () => {
  beforeEach(() => {
    vi.stubGlobal('fetch', vi.fn());
  });
  const credentials = { accountId: 'acc-123', apiToken: 'token-abc' };
  // 200 response carrying the given payload as a JSON body.
  const jsonReply = (payload) => new Response(JSON.stringify(payload), {
    status: 200,
    headers: { 'Content-Type': 'application/json' }
  });
  const scenarios = [
    {
      name: 'initiates crawl successfully',
      ...credentials,
      options: { url: 'https://example.com', formats: ['markdown'] },
      mockResponse: { success: true, result: { id: 'job-123' } },
      expectedJobId: 'job-123',
    },
    {
      name: 'throws on HTTP error',
      ...credentials,
      options: { url: 'https://example.com' },
      mockResponse: null,
      mockStatus: 401,
      expectedError: 'Failed to initiate crawl: 401',
    },
    {
      name: 'throws on API failure',
      ...credentials,
      options: { url: 'https://example.com' },
      mockResponse: { success: false, errors: [{ message: 'Invalid URL' }] },
      expectedError: 'Crawl initiation failed',
    },
  ];
  it.each(scenarios)('$name', async ({ accountId, apiToken, options, mockResponse, mockStatus, expectedJobId, expectedError }) => {
    // Only failing scenarios may override the HTTP status; all others get a JSON 200.
    const reply = expectedError && mockStatus
      ? new Response('', { status: mockStatus })
      : jsonReply(mockResponse);
    vi.mocked(fetch).mockResolvedValueOnce(reply);
    if (expectedError) {
      await expect(initiateCrawl(accountId, apiToken, options)).rejects.toThrow(expectedError);
      return;
    }
    const jobId = await initiateCrawl(accountId, apiToken, options);
    expect(jobId).toBe(expectedJobId);
  });
});
// Checks that waitForCrawl returns the job result as soon as the reported status
// is not "running", using a stubbed global fetch.
describe('waitForCrawl', () => {
  beforeEach(() => {
    vi.stubGlobal('fetch', vi.fn());
  });
  const job = { accountId: 'acc-123', apiToken: 'token-abc', jobId: 'job-123' };
  const scenarios = [
    {
      name: 'returns completed result immediately',
      ...job,
      mockResponse: { result: { status: 'completed', total: 5, records: [] } },
      expectedStatus: 'completed',
    },
    {
      name: 'returns errored result',
      ...job,
      mockResponse: { result: { status: 'errored', error: 'Something went wrong' } },
      expectedStatus: 'errored',
    },
    {
      name: 'returns cancelled_due_to_limits result',
      ...job,
      mockResponse: { result: { status: 'cancelled_due_to_limits' } },
      expectedStatus: 'cancelled_due_to_limits',
    },
  ];
  it.each(scenarios)('$name', async ({ accountId, apiToken, jobId, mockResponse, expectedStatus }) => {
    const reply = new Response(JSON.stringify(mockResponse), {
      status: 200,
      headers: { 'Content-Type': 'application/json' }
    });
    vi.mocked(fetch).mockResolvedValue(reply);
    // One attempt with a 1 ms delay keeps the test fast.
    const { status } = await waitForCrawl(accountId, apiToken, jobId, 1, 1);
    expect(status).toBe(expectedStatus);
  });
});