feat: improve URL discovery logging and filtering

main
brobert (aider) 3 months ago
parent 30e58286eb
commit f0db6bd7b9

@@ -51,17 +51,28 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
if (depth > maxDepth) continue; if (depth > maxDepth) continue;
if (discoveredUrls.has(url)) continue; if (discoveredUrls.has(url)) continue;
console.log(`Processing URL (depth ${depth}): ${url}`); console.log(`\nProcessing URL (depth ${depth})`);
console.log(`Queue size: ${queue.length}`);
console.log(`Discovered URLs: ${discoveredUrls.size}`);
console.log(`Post URLs found: ${postUrls.size}`);
console.log(`Current URL: ${url}`);
discoveredUrls.add(url); discoveredUrls.add(url);
try { try {
const html = await fetchWithRetry(url); const html = await fetchWithRetry(url);
const {document} = parseHTML(html); const {document} = parseHTML(html);
// Check if this is a post URL // Check if this is a post URL (must match both domain and date pattern)
if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) { const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
const isPostUrl = /\d{4}\/\d{2}\/\d{2}\//.test(url);
if (isOurDomain && isPostUrl) {
postUrls.add(url); postUrls.add(url);
console.log(`Found post URL: ${url}`); console.log(`✅ Found post URL: ${url}`);
} else if (isOurDomain) {
console.log(`🔍 Found internal link: ${url}`);
} else {
console.log(`🚫 Skipping external link: ${url}`);
} }
// Find all links on page // Find all links on page

Loading…
Cancel
Save