Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 9 additions & 10 deletions packages/@nitpicker/crawler/src/crawler/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import { crawlerLog } from '../debug.js';

import { detectPaginationPattern } from './detect-pagination-pattern.js';
import { fetchDestination } from './fetch-destination.js';
import { formatCrawlProgress } from './format-crawl-progress.js';
import { generatePredictedUrls } from './generate-predicted-urls.js';
import { handleIgnoreAndSkip } from './handle-ignore-and-skip.js';
import { handleResourceResponse } from './handle-resource-response.js';
Expand Down Expand Up @@ -579,16 +580,14 @@ export default class Crawler extends EventEmitter<CrawlerEventTypes> {
interval: this.#options.interval,
verbose: this.#options.verbose || !process.stdout.isTTY,
header: (_progress, done, total, limit) => {
const allDone = done + resumeOffset;
const allTotal = total + resumeOffset;
const extTotal = externalUrls.size;
const extDone = externalDoneUrls.size;
const pct = allTotal > 0 ? Math.round((allDone / allTotal) * 100) : 0;
return (
c.bold(`Crawling: ${allDone - extDone}/${allTotal - extTotal}`) +
c.dim(`(${extDone}/${extTotal})`) +
c.bold(` (${pct}%) [${limit} parallel]`)
);
return formatCrawlProgress({
done,
total,
resumeOffset,
externalTotal: externalUrls.size,
externalDone: externalDoneUrls.size,
limit,
});
},
onPush: (url) => {
const key = protocolAgnosticKey(url.withoutHashAndAuth);
Expand Down
150 changes: 150 additions & 0 deletions packages/@nitpicker/crawler/src/crawler/format-crawl-progress.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import c from 'ansi-colors';
import { describe, it, expect, beforeAll, afterAll } from 'vitest';

import { formatCrawlProgress } from './format-crawl-progress.js';

// Unit tests for formatCrawlProgress, which renders the single-line progress
// header shown during a crawl: internal done/found counts, external URL
// counts, completion percentage, remaining work, and worker parallelism.
describe('formatCrawlProgress', () => {
  // Remember the ambient ansi-colors state so the suite can restore it.
  const originalEnabled = c.enabled;

  beforeAll(() => {
    // Disable ANSI escape codes so string assertions match plain text.
    c.enabled = false;
  });

  afterAll(() => {
    // Restore color output to avoid leaking state into other suites.
    c.enabled = originalEnabled;
  });

  it('shows done, found, remaining for internal pages', () => {
    const result = formatCrawlProgress({
      done: 50,
      total: 100,
      resumeOffset: 0,
      externalTotal: 0,
      externalDone: 0,
      limit: 10,
    });
    expect(result).toContain('50 done / 100 found');
    expect(result).toContain('50 remaining');
  });

  it('includes external page counts', () => {
    // Internal counts are the deal-queue totals minus the external counts:
    // internalDone = 60 - 10 = 50, internalTotal = 120 - 20 = 100.
    const result = formatCrawlProgress({
      done: 60,
      total: 120,
      resumeOffset: 0,
      externalTotal: 20,
      externalDone: 10,
      limit: 5,
    });
    expect(result).toContain('50 done / 100 found');
    expect(result).toContain('+10/20 ext');
    expect(result).toContain('60 remaining');
  });

  it('includes resumeOffset in done and total counts', () => {
    // A resumed session adds its prior progress to both done and total.
    const result = formatCrawlProgress({
      done: 30,
      total: 50,
      resumeOffset: 100,
      externalTotal: 0,
      externalDone: 0,
      limit: 10,
    });
    expect(result).toContain('130 done / 150 found');
    expect(result).toContain('20 remaining');
  });

  it('shows parallel count', () => {
    const result = formatCrawlProgress({
      done: 10,
      total: 20,
      resumeOffset: 0,
      externalTotal: 0,
      externalDone: 0,
      limit: 8,
    });
    expect(result).toContain('8 parallel');
  });

  it('handles zero total', () => {
    // Guards the division-by-zero path in the percentage calculation.
    const result = formatCrawlProgress({
      done: 0,
      total: 0,
      resumeOffset: 0,
      externalTotal: 0,
      externalDone: 0,
      limit: 10,
    });
    expect(result).toContain('0 done / 0 found');
    expect(result).toContain('0 remaining');
  });

  it('calculates remaining correctly with both internal and external', () => {
    const result = formatCrawlProgress({
      done: 80,
      total: 200,
      resumeOffset: 0,
      externalTotal: 50,
      externalDone: 30,
      limit: 10,
    });
    // internal remaining: (200-50) - (80-30) = 150 - 50 = 100
    // external remaining: 50 - 30 = 20
    // total remaining: 120
    expect(result).toContain('120 remaining');
  });

  it('shows percentage', () => {
    const result = formatCrawlProgress({
      done: 50,
      total: 100,
      resumeOffset: 0,
      externalTotal: 0,
      externalDone: 0,
      limit: 10,
    });
    expect(result).toContain('(50%)');
  });

  it('shows 0% when total is zero', () => {
    const result = formatCrawlProgress({
      done: 0,
      total: 0,
      resumeOffset: 0,
      externalTotal: 0,
      externalDone: 0,
      limit: 10,
    });
    expect(result).toContain('(0%)');
  });

  it('produces exact expected format', () => {
    // Pins the full concatenated layout (colors disabled by beforeAll).
    const result = formatCrawlProgress({
      done: 50,
      total: 100,
      resumeOffset: 0,
      externalTotal: 0,
      externalDone: 0,
      limit: 10,
    });
    expect(result).toBe(
      'Crawling: 50 done / 100 found (+0/0 ext) (50%) [50 remaining] [10 parallel]',
    );
  });

  it('combines resumeOffset with external URLs correctly', () => {
    const result = formatCrawlProgress({
      done: 40,
      total: 80,
      resumeOffset: 20,
      externalTotal: 10,
      externalDone: 5,
      limit: 5,
    });
    // allDone=60, allTotal=100, internalDone=55, internalTotal=90
    // internalRemaining=35, externalRemaining=5, totalRemaining=40
    expect(result).toContain('55 done / 90 found');
    expect(result).toContain('+5/10 ext');
    expect(result).toContain('40 remaining');
  });
});
58 changes: 58 additions & 0 deletions packages/@nitpicker/crawler/src/crawler/format-crawl-progress.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import c from 'ansi-colors';

/**
 * Parameters for formatting crawl progress display.
 *
 * Counts for the current session (`done`/`total`) exclude any previous
 * resumed session; `resumeOffset` is added to both when formatting.
 * External URL counts are subsets of the combined done/total figures.
 */
interface FormatCrawlProgressParams {
  /** Number of URLs completed by the deal queue (current session only) */
  readonly done: number;
  /** Total number of URLs in the deal queue (including completed) */
  readonly total: number;
  /** Offset from a previous resumed session; added to both done and total */
  readonly resumeOffset: number;
  /** Number of external URLs discovered (included in the combined total) */
  readonly externalTotal: number;
  /** Number of external URLs completed (included in the combined done) */
  readonly externalDone: number;
  /** Number of parallel workers shown as "[N parallel]" */
  readonly limit: number;
}

/**
 * Formats the crawl progress header for the deal() progress display.
 *
 * Presents progress as "done / found (remaining)" rather than "done/total"
 * to make it clearer that the total is expected to grow during crawling.
 * @param params - The crawl progress parameters.
 * @param params.done - Number of URLs completed by the deal queue.
 * @param params.total - Total number of URLs in the deal queue (including completed).
 * @param params.resumeOffset - Offset from a previous resumed session.
 * @param params.externalTotal - Number of external URLs discovered.
 * @param params.externalDone - Number of external URLs completed.
 * @param params.limit - Number of parallel workers.
 * @returns The formatted progress string with ANSI color codes.
 */
export function formatCrawlProgress({
  done,
  total,
  resumeOffset,
  externalTotal,
  externalDone,
  limit,
}: FormatCrawlProgressParams): string {
  // Fold in the counts carried over from a resumed session.
  const completed = done + resumeOffset;
  const discovered = total + resumeOffset;

  // Internal pages are everything that is not an external URL.
  const internalCompleted = completed - externalDone;
  const internalDiscovered = discovered - externalTotal;

  // Remaining work across both the internal and external queues.
  const remaining =
    internalDiscovered - internalCompleted + (externalTotal - externalDone);

  // Percentage over the combined totals; guard against division by zero.
  const percent =
    discovered > 0 ? Math.round((completed / discovered) * 100) : 0;

  const segments = [
    c.bold(`Crawling: ${internalCompleted} done / ${internalDiscovered} found`),
    c.dim(` (+${externalDone}/${externalTotal} ext)`),
    c.bold(` (${percent}%) [${remaining} remaining]`),
    c.dim(` [${limit} parallel]`),
  ];
  return segments.join('');
}
Loading