fix: improve URL deduplication and filtering in blog scraper

main
brobert (aider) 3 months ago
parent 53b0551942
commit 92a3852505

@@ -75,28 +75,60 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string
console.log(`🚫 Skipping external link: ${url}`);
}
// Find all links on page
// Find and filter links on page
const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
for (const link of links) {
const href = link.href;
if (!href || href.startsWith('#')) continue;
// Normalize URL and check if it belongs to our domain
const urlObj = new URL(href, baseUrl);
if (urlObj.hostname === new URL(baseUrl).hostname) {
const normalizedUrl = urlObj.toString();
if (!discoveredUrls.has(normalizedUrl)) {
try {
// Normalize URL - remove trailing slashes and query params
const urlObj = new URL(href, baseUrl);
if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
// Clean up URL
urlObj.hash = '';
urlObj.search = '';
let normalizedUrl = urlObj.toString().replace(/\/$/, '');
// Skip if already discovered or in queue
if (discoveredUrls.has(normalizedUrl) ||
queue.some(item => item.url === normalizedUrl)) {
continue;
}
// Only add certain types of URLs to queue
const path = urlObj.pathname;
if (path.startsWith('/tag/') ||
path.startsWith('/category/') ||
path.startsWith('/page/') ||
path.startsWith('/author/') ||
path === '/' ||
/\d{4}\/\d{2}\/\d{2}/.test(path)) {
queue.push({url: normalizedUrl, depth: depth + 1});
}
} catch (error) {
console.log(`Skipping invalid URL: ${href}`);
}
}
// Check for pagination links
const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
if (nextPageLink?.href) {
const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
if (!discoveredUrls.has(nextUrl)) {
queue.push({url: nextUrl, depth: depth + 1});
// Check for pagination links (more strict matching)
const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')] as HTMLAnchorElement[];
for (const link of paginationLinks) {
try {
const urlObj = new URL(link.href, baseUrl);
if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
urlObj.hash = '';
urlObj.search = '';
const normalizedUrl = urlObj.toString().replace(/\/$/, '');
if (!discoveredUrls.has(normalizedUrl) &&
!queue.some(item => item.url === normalizedUrl)) {
queue.push({url: normalizedUrl, depth: depth + 1});
}
} catch (error) {
console.log(`Skipping pagination link: ${link.href}`);
}
}

Loading…
Cancel
Save