From 92a385250558f25631f9260a7ab9fce594ae6424 Mon Sep 17 00:00:00 2001
From: "brobert (aider)" <borja@brobert.net>
Date: Tue, 1 Apr 2025 15:45:37 +0200
Subject: [PATCH] fix: improve URL deduplication and filtering in blog scraper

---
 scrape-blog.ts | 56 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 12 deletions(-)

diff --git a/scrape-blog.ts b/scrape-blog.ts
index c7c9372..bd3decc 100644
--- a/scrape-blog.ts
+++ b/scrape-blog.ts
@@ -75,28 +75,60 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
         console.log(`🚫 Skipping external link: ${url}`);
       }
 
-      // Find all links on page
+      // Find and filter links on page
       const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
       for (const link of links) {
         const href = link.href;
         if (!href || href.startsWith('#')) continue;
 
-        // Normalize URL and check if it belongs to our domain
-        const urlObj = new URL(href, baseUrl);
-        if (urlObj.hostname === new URL(baseUrl).hostname) {
-          const normalizedUrl = urlObj.toString();
-          if (!discoveredUrls.has(normalizedUrl)) {
+        try {
+          // Normalize URL - resolve against baseUrl, then strip fragment, query params and one trailing slash
+          const urlObj = new URL(href, baseUrl);
+          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
+          
+          // Clean up URL
+          urlObj.hash = '';
+          urlObj.search = '';
+          let normalizedUrl = urlObj.toString().replace(/\/$/, '');
+          
+          // Skip if already discovered or in queue
+          if (discoveredUrls.has(normalizedUrl) || 
+              queue.some(item => item.url === normalizedUrl)) {
+            continue;
+          }
+
+          // Only add certain types of URLs to queue
+          const path = urlObj.pathname;
+          if (path.startsWith('/tag/') || 
+              path.startsWith('/category/') ||
+              path.startsWith('/page/') ||
+              path.startsWith('/author/') ||
+              path === '/' ||
+              /\d{4}\/\d{2}\/\d{2}/.test(path)) {
             queue.push({url: normalizedUrl, depth: depth + 1});
           }
+        } catch (error) {
+          console.log(`Skipping invalid URL: ${href}`);
         }
       }
 
-      // Check for pagination links
-      const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
-      if (nextPageLink?.href) {
-        const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
-        if (!discoveredUrls.has(nextUrl)) {
-          queue.push({url: nextUrl, depth: depth + 1});
+      // Check for pagination links (same-host only, deduplicated against discovered URLs and the queue)
+      const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')] as HTMLAnchorElement[];
+      for (const link of paginationLinks) {
+        try {
+          const urlObj = new URL(link.href, baseUrl);
+          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
+          
+          urlObj.hash = '';
+          urlObj.search = '';
+          const normalizedUrl = urlObj.toString().replace(/\/$/, '');
+          
+          if (!discoveredUrls.has(normalizedUrl) && 
+              !queue.some(item => item.url === normalizedUrl)) {
+            queue.push({url: normalizedUrl, depth: depth + 1});
+          }
+        } catch (error) {
+          console.log(`Skipping pagination link: ${link.href}`);
         }
       }