import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';

interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string; // optional; not populated by the current selectors
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 2000, // 2 seconds between requests
  maxConcurrentRequests: 2,   // not used yet; posts are fetched sequentially below
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};

async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });

      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * (i + 1); // Linear backoff: 5s, then 10s, ...
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}

async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Fetching blog posts from ${BLOG_URL}...`);
  const html = await fetchWithRetry(BLOG_URL);
  const { document } = parseHTML(html);

  // Extract and filter post URLs
  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
  const postUrls = postLinks
    .map(link => link.href)
    .filter(url => {
      // Only include URLs that look like actual posts
      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
      if (!isPost) {
        console.log(`Skipping non-post URL: ${url}`);
      }
      return isPost;
    });

  // Fetch and parse a single post, waiting before each request to stay polite
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    console.log(`Processing post: ${url}`);

    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      return {
        title,
        author,
        content,
        url
      };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };

  // Process all posts sequentially, with a delay before each request
  const results: (BlogPost | null)[] = [];
  for (const url of postUrls) {
    results.push(await processPost(url));
  }

  const posts: BlogPost[] = results.filter((post): post is BlogPost => post !== null);

  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
  return posts;
}

async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);