import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises'; // promise-based sleep, shadows the global on purpose

/** A single scraped blog post. `date` is reserved for future extraction and is currently never set. */
interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string;
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 2000, // 2 seconds between requests
  maxConcurrentRequests: 2,   // batch size for parallel post fetches
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};

/**
 * Fetch a URL and return its body as text, retrying transient failures.
 *
 * Waits 5s, 10s, 20s, ... between attempts (true exponential backoff —
 * the original `5000 * (i + 1)` was linear despite its comment).
 *
 * @param url     Target URL.
 * @param retries Total number of attempts (default 3).
 * @throws The last fetch/HTTP error once all attempts are exhausted.
 */
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });
      // fetch() only rejects on network errors; turn HTTP errors into throws
      // so they go through the same retry path.
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * 2 ** i; // Exponential backoff: 5s, 10s, 20s, ...
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached'); // unreachable, satisfies noImplicitReturns
}

/**
 * Scrape the blog index, then fetch and parse each individual post.
 *
 * Post URLs are filtered to WordPress-style permalinks (/YYYY/MM/DD/).
 * Requests are issued in batches of `SCRAPE_CONFIG.maxConcurrentRequests`
 * (previously this config value was declared but ignored — all posts were
 * fetched at once), with a delay before each request.
 *
 * @returns The successfully parsed posts; failed posts are logged and skipped.
 */
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Fetching blog posts from ${BLOG_URL}...`);
  const html = await fetchWithRetry(BLOG_URL);
  const { document } = parseHTML(html);

  // Extract and filter post URLs
  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
  const postUrls = postLinks
    .map(link => link.href)
    .filter(url => {
      // Only include URLs that look like actual posts
      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
      if (!isPost) {
        console.log(`Skipping non-post URL: ${url}`);
      }
      return isPost;
    });

  /** Fetch and parse one post; returns null (after logging) on failure. */
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    console.log(`Processing post: ${url}`);
    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);
      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';
      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';
      return { title, author, content, url };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };

  // Limit to 10 posts for initial testing, and honor maxConcurrentRequests
  // by processing in fixed-size batches rather than all at once.
  const targetUrls = postUrls.slice(0, 10);
  const posts: BlogPost[] = [];
  for (let i = 0; i < targetUrls.length; i += SCRAPE_CONFIG.maxConcurrentRequests) {
    const batch = targetUrls.slice(i, i + SCRAPE_CONFIG.maxConcurrentRequests);
    const results = await Promise.allSettled(batch.map(processPost));
    for (const result of results) {
      if (result.status === 'fulfilled' && result.value !== null) {
        posts.push(result.value);
      }
    }
  }

  // Report against the attempted subset, not all discovered URLs.
  console.log(`Successfully processed ${posts.length}/${targetUrls.length} posts`);
  return posts;
}

/** Entry point: scrape posts and persist them as pretty-printed JSON. */
async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);