import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';

interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string;
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 4000, // 4 seconds between requests
  maxConcurrentRequests: 2, // Not used yet: posts are currently fetched sequentially
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};

async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * 2 ** i; // Exponential backoff: 5s, 10s, 20s, ...
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}

async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  const discoveredUrls = new Set<string>();
  const queue: { url: string; depth: number }[] = [{ url: baseUrl, depth: 0 }];
  const postUrls = new Set<string>();
  const baseHostname = new URL(baseUrl).hostname;

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  // Breadth-first crawl of same-domain links up to maxDepth
  while (queue.length > 0) {
    const { url, depth } = queue.shift()!;
    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`Processing URL (depth ${depth}): ${url}`);
    discoveredUrls.add(url);

    try {
      const html = await fetchWithRetry(url);
      const { document } = parseHTML(html);

      // WordPress permalinks embed the date, e.g. /2010/07/21/slug/
      if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) {
        postUrls.add(url);
        console.log(`Found post URL: ${url}`);
      }

      // Enqueue every same-domain link found on the page
      const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
      for (const link of links) {
        const href = link.href;
        if (!href || href.startsWith('#')) continue;

        // Normalize the URL and check that it belongs to our domain
        const urlObj = new URL(href, baseUrl);
        if (urlObj.hostname === baseHostname) {
          const normalizedUrl = urlObj.toString();
          if (!discoveredUrls.has(normalizedUrl)) {
            queue.push({ url: normalizedUrl, depth: depth + 1 });
          }
        }
      }

      // Follow pagination links as well
      const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement | null;
      if (nextPageLink?.href) {
        const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
        if (!discoveredUrls.has(nextUrl)) {
          queue.push({ url: nextUrl, depth: depth + 1 });
        }
      }

      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}

async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
  const postUrls = await discoverAllPostUrls(BLOG_URL);
  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }

  // Fetch and parse a single post, pausing first to respect the rate limit
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      // Derive the post date from the permalink (yyyy/mm/dd -> yyyy-mm-dd)
      const dateMatch = url.match(/(\d{4})\/(\d{2})\/(\d{2})\//);
      const date = dateMatch ? `${dateMatch[1]}-${dateMatch[2]}-${dateMatch[3]}` : undefined;

      return { title, author, content, date, url };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };

  // Process posts sequentially with progress logging
  const posts: BlogPost[] = [];
  let processed = 0;
  const total = postUrls.length;

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await processPost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Finished processing. Success rate: ${posts.length}/${total} (${Math.round((posts.length / total) * 100)}%)`);
  return posts;
}

async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);
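
// Usage (a sketch, not part of the scraper itself; assumes this file is saved
// as scraper.ts — a hypothetical name — on Node 18+ where global fetch is
// available, with `linkedom` installed via `npm install linkedom` and a
// TypeScript runner such as `tsx` on hand):
//
//   npx tsx scraper.ts
//
// The script crawls the blog breadth-first up to maxDepth, collects permalink
// URLs matching the yyyy/mm/dd pattern, then writes the scraped posts to
// ./posts.json in the current working directory.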