From f0db6bd7b9948e3f1792d6b651b2276e1469ba87 Mon Sep 17 00:00:00 2001 From: "brobert (aider)" Date: Tue, 1 Apr 2025 15:42:03 +0200 Subject: [PATCH] feat: improve URL discovery logging and filtering --- scrape-blog.ts | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/scrape-blog.ts b/scrape-blog.ts index 1bc779d..c7c9372 100644 --- a/scrape-blog.ts +++ b/scrape-blog.ts @@ -51,17 +51,28 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise maxDepth) continue; if (discoveredUrls.has(url)) continue; - console.log(`Processing URL (depth ${depth}): ${url}`); + console.log(`\nProcessing URL (depth ${depth})`); + console.log(`Queue size: ${queue.length}`); + console.log(`Discovered URLs: ${discoveredUrls.size}`); + console.log(`Post URLs found: ${postUrls.size}`); + console.log(`Current URL: ${url}`); discoveredUrls.add(url); try { const html = await fetchWithRetry(url); const {document} = parseHTML(html); - // Check if this is a post URL - if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) { + // Check if this is a post URL (must match both domain and date pattern) + const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname; + const isPostUrl = /\d{4}\/\d{2}\/\d{2}\//.test(url); + + if (isOurDomain && isPostUrl) { postUrls.add(url); - console.log(`Found post URL: ${url}`); + console.log(`✅ Found post URL: ${url}`); + } else if (isOurDomain) { + console.log(`🔍 Found internal link: ${url}`); + } else { + console.log(`🚫 Skipping external link: ${url}`); } // Find all links on page