From 92a385250558f25631f9260a7ab9fce594ae6424 Mon Sep 17 00:00:00 2001 From: "brobert (aider)" Date: Tue, 1 Apr 2025 15:45:37 +0200 Subject: [PATCH] fix: improve URL deduplication and filtering in blog scraper --- scrape-blog.ts | 56 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/scrape-blog.ts b/scrape-blog.ts index c7c9372..bd3decc 100644 --- a/scrape-blog.ts +++ b/scrape-blog.ts @@ -75,28 +75,60 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise item.url === normalizedUrl)) { + continue; + } + + // Only add certain types of URLs to queue + const path = urlObj.pathname; + if (path.startsWith('/tag/') || + path.startsWith('/category/') || + path.startsWith('/page/') || + path.startsWith('/author/') || + path === '/' || + /\d{4}\/\d{2}\/\d{2}/.test(path)) { queue.push({url: normalizedUrl, depth: depth + 1}); } + } catch (error) { + console.log(`Skipping invalid URL: ${href}`); } } - // Check for pagination links - const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement; - if (nextPageLink?.href) { - const nextUrl = new URL(nextPageLink.href, baseUrl).toString(); - if (!discoveredUrls.has(nextUrl)) { - queue.push({url: nextUrl, depth: depth + 1}); + // Check for pagination links (more strict matching) + const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')] as HTMLAnchorElement[]; + for (const link of paginationLinks) { + try { + const urlObj = new URL(link.href, baseUrl); + if (urlObj.hostname !== new URL(baseUrl).hostname) continue; + + urlObj.hash = ''; + urlObj.search = ''; + const normalizedUrl = urlObj.toString().replace(/\/$/, ''); + + if (!discoveredUrls.has(normalizedUrl) && + !queue.some(item => item.url === normalizedUrl)) { + queue.push({url: normalizedUrl, depth: depth + 1}); + } + } catch (error) { + console.log(`Skipping pagination link: ${link.href}`); } }