diff --git a/packages/@nitpicker/crawler/src/crawler/crawler.ts b/packages/@nitpicker/crawler/src/crawler/crawler.ts
index ef19a36..29fbb71 100644
--- a/packages/@nitpicker/crawler/src/crawler/crawler.ts
+++ b/packages/@nitpicker/crawler/src/crawler/crawler.ts
@@ -22,6 +22,7 @@ import { crawlerLog } from '../debug.js';
 
 import { detectPaginationPattern } from './detect-pagination-pattern.js';
 import { fetchDestination } from './fetch-destination.js';
+import { formatCrawlProgress } from './format-crawl-progress.js';
 import { generatePredictedUrls } from './generate-predicted-urls.js';
 import { handleIgnoreAndSkip } from './handle-ignore-and-skip.js';
 import { handleResourceResponse } from './handle-resource-response.js';
@@ -579,16 +580,14 @@ export default class Crawler extends EventEmitter {
 				interval: this.#options.interval,
 				verbose: this.#options.verbose || !process.stdout.isTTY,
 				header: (_progress, done, total, limit) => {
-					const allDone = done + resumeOffset;
-					const allTotal = total + resumeOffset;
-					const extTotal = externalUrls.size;
-					const extDone = externalDoneUrls.size;
-					const pct = allTotal > 0 ? Math.round((allDone / allTotal) * 100) : 0;
-					return (
-						c.bold(`Crawling: ${allDone - extDone}/${allTotal - extTotal}`) +
-						c.dim(`(${extDone}/${extTotal})`) +
-						c.bold(` (${pct}%) [${limit} parallel]`)
-					);
+					return formatCrawlProgress({
+						done,
+						total,
+						resumeOffset,
+						externalTotal: externalUrls.size,
+						externalDone: externalDoneUrls.size,
+						limit,
+					});
 				},
 				onPush: (url) => {
 					const key = protocolAgnosticKey(url.withoutHashAndAuth);
diff --git a/packages/@nitpicker/crawler/src/crawler/format-crawl-progress.spec.ts b/packages/@nitpicker/crawler/src/crawler/format-crawl-progress.spec.ts
new file mode 100644
index 0000000..929ab63
--- /dev/null
+++ b/packages/@nitpicker/crawler/src/crawler/format-crawl-progress.spec.ts
@@ -0,0 +1,176 @@
+import c from 'ansi-colors';
+import { describe, it, expect, beforeAll, afterAll } from 'vitest';
+
+import { formatCrawlProgress } from './format-crawl-progress.js';
+
+describe('formatCrawlProgress', () => {
+	const originalEnabled = c.enabled;
+
+	beforeAll(() => {
+		c.enabled = false;
+	});
+
+	afterAll(() => {
+		c.enabled = originalEnabled;
+	});
+
+	it('shows done, found, remaining for internal pages', () => {
+		const result = formatCrawlProgress({
+			done: 50,
+			total: 100,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toContain('50 done / 100 found');
+		expect(result).toContain('50 remaining');
+	});
+
+	it('includes external page counts', () => {
+		const result = formatCrawlProgress({
+			done: 60,
+			total: 120,
+			resumeOffset: 0,
+			externalTotal: 20,
+			externalDone: 10,
+			limit: 5,
+		});
+		expect(result).toContain('50 done / 100 found');
+		expect(result).toContain('+10/20 ext');
+		expect(result).toContain('60 remaining');
+	});
+
+	it('includes resumeOffset in done and total counts', () => {
+		const result = formatCrawlProgress({
+			done: 30,
+			total: 50,
+			resumeOffset: 100,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toContain('130 done / 150 found');
+		expect(result).toContain('20 remaining');
+	});
+
+	it('shows parallel count', () => {
+		const result = formatCrawlProgress({
+			done: 10,
+			total: 20,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 8,
+		});
+		expect(result).toContain('8 parallel');
+	});
+
+	it('handles zero total', () => {
+		const result = formatCrawlProgress({
+			done: 0,
+			total: 0,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toContain('0 done / 0 found');
+		expect(result).toContain('0 remaining');
+	});
+
+	it('calculates remaining correctly with both internal and external', () => {
+		const result = formatCrawlProgress({
+			done: 80,
+			total: 200,
+			resumeOffset: 0,
+			externalTotal: 50,
+			externalDone: 30,
+			limit: 10,
+		});
+		// internal remaining: (200-50) - (80-30) = 150 - 50 = 100
+		// external remaining: 50 - 30 = 20
+		// total remaining: 120
+		expect(result).toContain('120 remaining');
+	});
+
+	it('shows percentage', () => {
+		const result = formatCrawlProgress({
+			done: 50,
+			total: 100,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toContain('(50%)');
+	});
+
+	it('shows 0% when total is zero', () => {
+		const result = formatCrawlProgress({
+			done: 0,
+			total: 0,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toContain('(0%)');
+	});
+
+	it('never shows 100% while pages remain', () => {
+		const result = formatCrawlProgress({
+			done: 199,
+			total: 200,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toContain('(99%)');
+		expect(result).toContain('[1 remaining]');
+	});
+
+	it('shows 100% once nothing remains', () => {
+		const result = formatCrawlProgress({
+			done: 200,
+			total: 200,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toContain('(100%)');
+		expect(result).toContain('[0 remaining]');
+	});
+
+	it('produces exact expected format', () => {
+		const result = formatCrawlProgress({
+			done: 50,
+			total: 100,
+			resumeOffset: 0,
+			externalTotal: 0,
+			externalDone: 0,
+			limit: 10,
+		});
+		expect(result).toBe(
+			'Crawling: 50 done / 100 found (+0/0 ext) (50%) [50 remaining] [10 parallel]',
+		);
+	});
+
+	it('combines resumeOffset with external URLs correctly', () => {
+		const result = formatCrawlProgress({
+			done: 40,
+			total: 80,
+			resumeOffset: 20,
+			externalTotal: 10,
+			externalDone: 5,
+			limit: 5,
+		});
+		// allDone=60, allTotal=100, internalDone=55, internalTotal=90
+		// internalRemaining=35, externalRemaining=5, totalRemaining=40
+		expect(result).toContain('55 done / 90 found');
+		expect(result).toContain('+5/10 ext');
+		expect(result).toContain('40 remaining');
+	});
+});
diff --git a/packages/@nitpicker/crawler/src/crawler/format-crawl-progress.ts b/packages/@nitpicker/crawler/src/crawler/format-crawl-progress.ts
new file mode 100644
index 0000000..d41cc1f
--- /dev/null
+++ b/packages/@nitpicker/crawler/src/crawler/format-crawl-progress.ts
@@ -0,0 +1,66 @@
+import c from 'ansi-colors';
+
+/**
+ * Parameters for formatting crawl progress display.
+ */
+interface FormatCrawlProgressParams {
+	/** Number of URLs completed by the deal queue */
+	readonly done: number;
+	/** Total number of URLs in the deal queue (including completed) */
+	readonly total: number;
+	/** Offset from a previous resumed session */
+	readonly resumeOffset: number;
+	/** Number of external URLs discovered */
+	readonly externalTotal: number;
+	/** Number of external URLs completed */
+	readonly externalDone: number;
+	/** Number of parallel workers */
+	readonly limit: number;
+}
+
+/**
+ * Formats the crawl progress header for the deal() progress display.
+ *
+ * Output shape (colors omitted):
+ * `Crawling: {done} done / {found} found (+{extDone}/{extTotal} ext) ({pct}%) [{remaining} remaining] [{limit} parallel]`
+ *
+ * Says "done / found" rather than "done/total" to make it clearer
+ * that the total is expected to grow during crawling.
+ *
+ * The percentage is rounded but held at 99 while any URL is still
+ * pending, so "(100%)" never appears next to a non-zero remaining count.
+ * @param params - The crawl progress parameters.
+ * @param params.done - Number of URLs completed by the deal queue.
+ * @param params.total - Total number of URLs in the deal queue (including completed).
+ * @param params.resumeOffset - Offset from a previous resumed session.
+ * @param params.externalTotal - Number of external URLs discovered.
+ * @param params.externalDone - Number of external URLs completed.
+ * @param params.limit - Number of parallel workers.
+ * @returns The formatted progress string with ANSI color codes.
+ */
+export function formatCrawlProgress({
+	done,
+	total,
+	resumeOffset,
+	externalTotal,
+	externalDone,
+	limit,
+}: FormatCrawlProgressParams): string {
+	const allDone = done + resumeOffset;
+	const allTotal = total + resumeOffset;
+	const internalDone = allDone - externalDone;
+	const internalTotal = allTotal - externalTotal;
+	const internalRemaining = internalTotal - internalDone;
+	const externalRemaining = externalTotal - externalDone;
+	const totalRemaining = internalRemaining + externalRemaining;
+	const roundedPct = allTotal > 0 ? Math.round((allDone / allTotal) * 100) : 0;
+	// Rounding alone turns e.g. 199/200 into 100%; hold at 99% until nothing is pending.
+	const pct = totalRemaining > 0 ? Math.min(roundedPct, 99) : roundedPct;
+
+	return (
+		c.bold(`Crawling: ${internalDone} done / ${internalTotal} found`) +
+		c.dim(` (+${externalDone}/${externalTotal} ext)`) +
+		c.bold(` (${pct}%) [${totalRemaining} remaining]`) +
+		c.dim(` [${limit} parallel]`)
+	);
+}