import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';
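
/**
 * Scraper for the "Crónicas Periodísticas" WordPress blog.
 *
 * Flow: crawl the site breadth-first to discover post permalinks
 * (discoverAllPostUrls), fetch and parse each post (fetchBlogPosts),
 * then write the results to posts.json.
 *
 * Assumes Node 18+ (for the global fetch API) with the `linkedom`
 * package installed for HTML parsing.
 */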
interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string;
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 4000, // 4 seconds between requests
  maxConcurrentRequests: 2, // currently unused: posts are processed sequentially
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};

async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * 2 ** i; // Exponential backoff: 5s, 10s, 20s
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}
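
/**
 * Breadth-first crawl of the blog, starting at `baseUrl`.
 *
 * Internal links are queued up to `maxDepth` levels deep; URLs matching
 * the WordPress permalink pattern /YYYY/MM/DD/slug/ are collected as
 * posts, while tag, category, and pagination pages are only followed
 * for further discovery.
 */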
async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  const discoveredUrls = new Set<string>();
  const queue: { url: string; depth: number }[] = [{ url: baseUrl, depth: 0 }];
  const postUrls = new Set<string>();

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  while (queue.length > 0) {
    const { url, depth } = queue.shift()!;
    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`\nProcessing URL (depth ${depth})`);
    console.log(`Queue size: ${queue.length}`);
    console.log(`Discovered URLs: ${discoveredUrls.size}`);
    console.log(`Post URLs found: ${postUrls.size}`);
    console.log(`Current URL: ${url}`);

    discoveredUrls.add(url);
    try {
      const html = await fetchWithRetry(url);
      const { document } = parseHTML(html);

      // Check if this is a post URL
      const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
      if (!isOurDomain) {
        console.log(`🚫 Skipping external link: ${url}`);
        continue;
      }

      // Strict check for the post URL pattern /YYYY/MM/DD/slug/; the trailing
      // slash is optional because queued URLs are normalized without one
      const path = new URL(url).pathname;
      const isPostUrl = /^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/?$/.test(path);

      if (isPostUrl) {
        postUrls.add(url);
        console.log(`✅ Found post URL: ${url}`);
      } else {
        // Log what type of internal link we found
        if (path.startsWith('/tag/')) {
          console.log(`🏷️ Found tag page: ${url}`);
        } else if (path.startsWith('/category/')) {
          console.log(`🗂️ Found category page: ${url}`);
        } else if (path.startsWith('/page/')) {
          console.log(`📄 Found pagination page: ${url}`);
        } else {
          console.log(`🔍 Found internal link: ${url}`);
        }
      }
      // Find and filter links on the page
      const links = [...document.querySelectorAll('a[href]')];
      for (const link of links) {
        // Read the raw attribute; relative URLs are resolved against baseUrl below
        const href = link.getAttribute('href');
        if (!href || href.startsWith('#')) continue;
        try {
          // Normalize URL - drop hash, query params and trailing slash
          const urlObj = new URL(href, baseUrl);
          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
          urlObj.hash = '';
          urlObj.search = '';
          const normalizedUrl = urlObj.toString().replace(/\/$/, '');

          // Skip if already discovered or in queue
          if (discoveredUrls.has(normalizedUrl) ||
              queue.some(item => item.url === normalizedUrl)) {
            continue;
          }

          // Add pagination, tag, category and post URLs to the queue
          const path = urlObj.pathname;
          if (path.startsWith('/page/') || // Pagination
              path.startsWith('/tag/') || // Tag pages
              path.startsWith('/category/') || // Category pages
              /^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/?$/.test(path)) { // Post URLs
            queue.push({ url: normalizedUrl, depth: depth + 1 });
          }
        } catch (error) {
          console.log(`Skipping invalid URL: ${href}`);
        }
      }
      // Check for pagination links (more strict matching)
      const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')];
      for (const link of paginationLinks) {
        const href = link.getAttribute('href');
        if (!href) continue;
        try {
          const urlObj = new URL(href, baseUrl);
          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
          urlObj.hash = '';
          urlObj.search = '';
          const normalizedUrl = urlObj.toString().replace(/\/$/, '');
          if (!discoveredUrls.has(normalizedUrl) &&
              !queue.some(item => item.url === normalizedUrl)) {
            queue.push({ url: normalizedUrl, depth: depth + 1 });
          }
        } catch (error) {
          console.log(`Skipping pagination link: ${href}`);
        }
      }
      // Be polite: pause between page fetches
      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}
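
/**
 * Fetches every discovered post and extracts title, author and HTML body.
 *
 * The selectors used below (`h2.pagetitle`, `div.entry`,
 * `a[rel="category tag"]`) match this blog's WordPress theme and may
 * need adjusting for other themes.
 */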
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
  const postUrls = await discoverAllPostUrls(BLOG_URL);

  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }
  // Process posts sequentially, pausing between requests
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      return { title, author, content, url };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };
  // Process posts with progress logging
  const posts: BlogPost[] = [];
  let processed = 0;
  const total = postUrls.length;

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await processPost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length / postUrls.length) * 100)}%)`);
  return posts;
}

async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);