feat: improve URL discovery logging and filtering

main
brobert (aider) 3 months ago
parent 30e58286eb
commit f0db6bd7b9

@@ -51,17 +51,28 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
if (depth > maxDepth) continue;
if (discoveredUrls.has(url)) continue;
console.log(`Processing URL (depth ${depth}): ${url}`);
console.log(`\nProcessing URL (depth ${depth})`);
console.log(`Queue size: ${queue.length}`);
console.log(`Discovered URLs: ${discoveredUrls.size}`);
console.log(`Post URLs found: ${postUrls.size}`);
console.log(`Current URL: ${url}`);
discoveredUrls.add(url);
try {
const html = await fetchWithRetry(url);
const {document} = parseHTML(html);
// Check if this is a post URL
if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) {
// Check if this is a post URL (must match both domain and date pattern)
const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
const isPostUrl = /\d{4}\/\d{2}\/\d{2}\//.test(url);
if (isOurDomain && isPostUrl) {
postUrls.add(url);
console.log(`Found post URL: ${url}`);
console.log(`✅ Found post URL: ${url}`);
} else if (isOurDomain) {
console.log(`🔍 Found internal link: ${url}`);
} else {
console.log(`🚫 Skipping external link: ${url}`);
}
// Find all links on page

Loading…
Cancel
Save