feat: add rate limiting and error handling to blog scraper

main
brobert (aider) 3 months ago
parent abf216867f
commit a872a18077

@@ -1,5 +1,6 @@
 import { parseHTML } from 'linkedom';
 import { writeFile } from 'fs/promises';
+import { setTimeout } from 'timers/promises';
 
 interface BlogPost {
   title: string;
@@ -11,10 +12,35 @@ interface BlogPost {
 const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';
 
+// Configure scraping behavior
+const SCRAPE_CONFIG = {
+  delayBetweenRequests: 2000, // 2 seconds between requests
+  maxConcurrentRequests: 2,
+  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
+};
+
+async function fetchWithRetry(url: string, retries = 3): Promise<string> {
+  for (let i = 0; i < retries; i++) {
+    try {
+      const response = await fetch(url, {
+        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
+      });
+      if (!response.ok) throw new Error(`HTTP ${response.status}`);
+      return await response.text();
+    } catch (error) {
+      if (i === retries - 1) throw error;
+      const waitTime = 5000 * (i + 1); // Linearly increasing backoff: 5s, 10s, 15s
+      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
+      await setTimeout(waitTime);
+    }
+  }
+  throw new Error('Max retries reached');
+}
+
 async function fetchBlogPosts(): Promise<BlogPost[]> {
   console.log(`Fetching blog posts from ${BLOG_URL}...`);
-  const response = await fetch(BLOG_URL);
-  const html = await response.text();
+  const html = await fetchWithRetry(BLOG_URL);
   const { document } = parseHTML(html);
 
   // Extract post URLs - this selector might need adjustment
@@ -23,11 +49,13 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
-  const posts: BlogPost[] = [];
-  for (const url of postUrls) {
+  // Process each post with a delay before fetching
+  const processPost = async (url: string): Promise<BlogPost | null> => {
+    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
     console.log(`Processing post: ${url}`);
     try {
-      const postResponse = await fetch(url);
-      const postHtml = await postResponse.text();
+      const postHtml = await fetchWithRetry(url);
       const { document: postDoc } = parseHTML(postHtml);
 
       const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
@@ -39,17 +67,28 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
         .filter(Boolean)
         .join(', ') || 'Unknown';
 
-      posts.push({
+      return {
         title,
         author,
         content,
         url
-      });
+      };
     } catch (error) {
       console.error(`Failed to process ${url}:`, error);
+      return null;
     }
-  }
+  };
+
+  // Run the post processors and keep only the successful results
+  const results = await Promise.allSettled(
+    postUrls.slice(0, 10).map(processPost) // Limit to 10 posts for initial testing
+  );
+
+  const posts = results
+    .filter(result => result.status === 'fulfilled' && result.value !== null)
+    .map(result => (result as PromiseFulfilledResult<BlogPost>).value);
+
+  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
+
   return posts;
 }
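Note on the retries: the waits in fetchWithRetry grow linearly (5s, 10s, 15s). If genuinely exponential backoff is wanted, a small change to the wait calculation would be enough; the lines below are a sketch of one option with jitter, not something this commit contains:

      // Hypothetical alternative inside the catch block: double the wait on
      // every retry and add up to one second of random jitter so parallel
      // retries do not fire in lockstep.
      const waitTime = 5000 * 2 ** i + Math.floor(Math.random() * 1000);
      await setTimeout(waitTime);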
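Note on concurrency: SCRAPE_CONFIG.maxConcurrentRequests is declared but never enforced; Promise.allSettled starts all ten processPost calls at once, so the per-post delay amounts to a single shared 2-second pause. A minimal sketch of one way to honour that setting, assuming the SCRAPE_CONFIG and BlogPost names from this commit (processInBatches itself is illustrative, not part of the diff):

// Sketch: process URLs in batches of maxConcurrentRequests so that at most
// that many requests are in flight at a time. Each processPost call already
// sleeps for delayBetweenRequests before fetching, which paces the batches.
async function processInBatches(
  urls: string[],
  process: (url: string) => Promise<BlogPost | null>
): Promise<BlogPost[]> {
  const collected: BlogPost[] = [];
  for (let i = 0; i < urls.length; i += SCRAPE_CONFIG.maxConcurrentRequests) {
    const batch = urls.slice(i, i + SCRAPE_CONFIG.maxConcurrentRequests);
    const settled = await Promise.allSettled(batch.map(process));
    for (const result of settled) {
      // Keep only posts that were fetched and parsed successfully
      if (result.status === 'fulfilled' && result.value !== null) {
        collected.push(result.value);
      }
    }
  }
  return collected;
}

fetchBlogPosts could then swap the inline Promise.allSettled block for const posts = await processInBatches(postUrls.slice(0, 10), processPost); and keep the same success logging.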
