feat: add recursive URL discovery with improved logging

main
brobert (aider) 3 months ago
parent b43f7db8bb
commit 30e58286eb

@@ -38,23 +38,75 @@ async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  throw new Error('Max retries reached');
}
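/**
 * Breadth-first crawl of same-host links starting from baseUrl, up to maxDepth.
 * URLs whose path contains a /YYYY/MM/DD/ date segment are collected as posts,
 * "a.next" pagination links are followed, and a delay is inserted between requests.
 */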
async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  const discoveredUrls = new Set<string>();
  const queue: {url: string; depth: number}[] = [{url: baseUrl, depth: 0}];
  const postUrls = new Set<string>();

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  while (queue.length > 0) {
    const {url, depth} = queue.shift()!;
    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`Processing URL (depth ${depth}): ${url}`);
    discoveredUrls.add(url);

    try {
      const html = await fetchWithRetry(url);
      const {document} = parseHTML(html);

      // Check if this is a post URL
      if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) {
        postUrls.add(url);
        console.log(`Found post URL: ${url}`);
      }

      // Find all links on page
      const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
      for (const link of links) {
        const href = link.href;
        if (!href || href.startsWith('#')) continue;

        // Normalize URL and check if it belongs to our domain
        const urlObj = new URL(href, baseUrl);
        if (urlObj.hostname === new URL(baseUrl).hostname) {
          const normalizedUrl = urlObj.toString();
          if (!discoveredUrls.has(normalizedUrl)) {
            queue.push({url: normalizedUrl, depth: depth + 1});
          }
        }
      }

      // Check for pagination links
      const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
      if (nextPageLink?.href) {
        const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
        if (!discoveredUrls.has(nextUrl)) {
          queue.push({url: nextUrl, depth: depth + 1});
        }
      }

      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}
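/**
 * Fetches all blog posts: discovers candidate post URLs, then processes each
 * URL sequentially with per-post progress logging.
 */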
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Fetching blog posts from ${BLOG_URL}...`);
  const html = await fetchWithRetry(BLOG_URL);
  const { document } = parseHTML(html);

  // Extract and filter post URLs
  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
  const postUrls = postLinks
    .map(link => link.href)
    .filter(url => {
      // Only include URLs that look like actual posts
      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
      if (!isPost) {
        console.log(`Skipping non-post URL: ${url}`);
      }
      return isPost;
    });

  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
  const postUrls = await discoverAllPostUrls(BLOG_URL);

  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }

  // Process posts with limited concurrency and delays
  const processPost = async (url: string): Promise<BlogPost | null> => {
@@ -90,15 +142,26 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
    }
  };

  // Process posts sequentially with delay
  const results = [];
  for (const url of postUrls) { // Process all posts
    results.push(await processPost(url));
  }

  // Process posts with progress logging
  const posts: BlogPost[] = [];
  let processed = 0;
  const total = postUrls.length;

  const posts: BlogPost[] = results.filter((post): post is BlogPost => post !== null);

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await processPost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length/postUrls.length)*100)}%)`);

  return posts;
}
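The post-URL check in both versions of the code is a date-path heuristic; the new discovery code drops the leading \/ that the old filter required. A standalone sketch of what the pattern accepts, with illustrative sample URLs:

// Same date-path heuristic as above: URLs containing /YYYY/MM/DD/ are treated as posts.
const postPattern = /\/\d{4}\/\d{2}\/\d{2}\//;

const samples = [
  'https://example.com/2023/05/14/some-post-slug/', // matches: contains /2023/05/14/
  'https://example.com/category/announcements/',    // no match: no date segment in the path
  'https://example.com/page/2/',                    // no match: pagination index, not a dated post
];

for (const url of samples) {
  console.log(`${postPattern.test(url) ? 'post' : 'skip'}  ${url}`);
}

For context, a driver for the updated fetchBlogPosts might look like the sketch below; the './scrape-blog' module path and the fetchBlogPosts export are assumptions for illustration, since the diff does not show the module's exports.

// Hypothetical entry point; the import path and export are assumed, not part of this commit.
import { fetchBlogPosts } from './scrape-blog';

async function main(): Promise<void> {
  const posts = await fetchBlogPosts();
  console.log(`Scraped ${posts.length} posts`);
  for (const post of posts.slice(0, 5)) {
    console.log(`- ${post.title}`);
  }
}

main().catch(err => {
  console.error('Scrape failed:', err);
  process.exit(1);
});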
