import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';

/** A single scraped blog post, in the shape written to `posts.json`. */
interface BlogPost {
  title: string;
  author: string;
  content: string;
  /** Declared for consumers; the scraper in this file never populates it. */
  date?: string;
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 4000, // 4 seconds between requests
  // NOTE: not read anywhere in this file; all requests are issued one at a time.
  maxConcurrentRequests: 2,
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};

/** Matches post permalinks of the form /YYYY/MM/DD/slug (with or without trailing slash). */
const POST_PATH_PATTERN = /^\/\d{4}\/\d{2}\/\d{2}\/[^/]+(\/)?$/;

/**
 * Fetches `url` as text, retrying on any failure (network error or non-2xx status).
 *
 * The wait between attempts grows linearly: 5s after the first failure,
 * 10s after the second, and so on.
 *
 * @param url - Absolute URL to fetch.
 * @param retries - Total number of attempts before giving up.
 * @returns The response body as text.
 * @throws The error of the last attempt, or `Max retries reached` when
 *   `retries` is zero or negative and no attempt was made.
 */
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (attempt === retries - 1) throw error;
      const waitTime = 5000 * (attempt + 1); // Linear backoff: 5s, 10s, 15s, ...
      console.log(`Retry ${attempt + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}

/**
 * Resolves `href` against `baseUrl` and normalizes it for de-duplication:
 * hash and query string are dropped and one trailing slash is removed.
 *
 * @returns The normalized URL and its pathname, or `null` when the link
 *   points to a different host than `baseHost`.
 * @throws TypeError when `href` cannot be parsed as a URL.
 */
function normalizeInternalUrl(
  href: string,
  baseUrl: string,
  baseHost: string
): { normalized: string; pathname: string } | null {
  const urlObj = new URL(href, baseUrl);
  if (urlObj.hostname !== baseHost) return null;
  urlObj.hash = '';
  urlObj.search = '';
  return {
    normalized: urlObj.toString().replace(/\/$/, ''),
    pathname: urlObj.pathname
  };
}

/**
 * Breadth-first crawl of the blog starting at `baseUrl`, collecting every
 * URL whose path looks like a post permalink.
 *
 * Only same-host links are followed, and only pagination, tag, category and
 * post pages are queued. Every queued link (pagination included) sits one
 * level deeper than the page it was found on, so `maxDepth` also bounds how
 * far pagination is followed.
 *
 * @param baseUrl - Root URL of the blog.
 * @param maxDepth - Maximum link depth to process; the root is depth 0.
 * @returns The discovered post URLs (normalized, unique). Pages that fail to
 *   load are logged and skipped; an unparseable `baseUrl` yields `[]`.
 */
async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  // Resolve the host once instead of re-parsing baseUrl for every link.
  let baseHost: string;
  try {
    baseHost = new URL(baseUrl).hostname;
  } catch (error) {
    // Nothing can be crawled from an invalid root; skip the pointless retries.
    console.error(`Error processing ${baseUrl}:`, error instanceof Error ? error.message : error);
    return [];
  }

  const discoveredUrls = new Set<string>();
  const postUrls = new Set<string>();
  const queue: { url: string; depth: number }[] = [{ url: baseUrl, depth: 0 }];
  // Mirrors the URLs currently in `queue` so membership checks are O(1).
  const queuedUrls = new Set<string>([baseUrl]);

  const enqueue = (url: string, depth: number): void => {
    if (discoveredUrls.has(url) || queuedUrls.has(url)) return;
    queue.push({ url, depth });
    queuedUrls.add(url);
  };

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  while (queue.length > 0) {
    const next = queue.shift();
    if (next === undefined) break;
    const { url, depth } = next;
    queuedUrls.delete(url);

    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`\nProcessing URL (depth ${depth})`);
    console.log(`Queue size: ${queue.length}`);
    console.log(`Discovered URLs: ${discoveredUrls.size}`);
    console.log(`Post URLs found: ${postUrls.size}`);
    console.log(`Current URL: ${url}`);

    discoveredUrls.add(url);

    try {
      // Reject foreign hosts before spending a request on them.
      const currentUrl = new URL(url);
      if (currentUrl.hostname !== baseHost) {
        console.log(`🚫 Skipping external link: ${url}`);
        continue;
      }

      const html = await fetchWithRetry(url);
      const { document } = parseHTML(html);

      // Check for post URL pattern (with or without trailing slash)
      const path = currentUrl.pathname;
      if (POST_PATH_PATTERN.test(path)) {
        postUrls.add(url);
        console.log(`✅ Found post URL: ${url}`);
      } else if (path.startsWith('/tag/')) {
        // Log what type of internal link we found
        console.log(`🏷️ Found tag page: ${url}`);
      } else if (path.startsWith('/category/')) {
        console.log(`🗂️ Found category page: ${url}`);
      } else if (path.startsWith('/page/')) {
        console.log(`📄 Found pagination page: ${url}`);
      } else {
        console.log(`🔍 Found internal link: ${url}`);
      }

      // Find and filter links on page
      const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
      for (const link of links) {
        const href = link.href;
        if (!href || href.startsWith('#')) continue;

        try {
          const target = normalizeInternalUrl(href, baseUrl, baseHost);
          if (target === null) continue;

          // Add pagination, tag, category and post URLs to queue
          const { normalized, pathname } = target;
          const isCrawlable =
            pathname.startsWith('/page/') || // Pagination
            pathname.startsWith('/tag/') || // Tag pages
            pathname.startsWith('/category/') || // Category pages
            POST_PATH_PATTERN.test(pathname); // Post URLs
          if (isCrawlable) enqueue(normalized, depth + 1);
        } catch (error) {
          console.log(`Skipping invalid URL: ${href}`);
        }
      }

      // Check for pagination links (more strict matching)
      const paginationLinks = [
        ...document.querySelectorAll('a.page-numbers, a.next, a.prev')
      ] as HTMLAnchorElement[];
      for (const link of paginationLinks) {
        try {
          const target = normalizeInternalUrl(link.href, baseUrl, baseHost);
          if (target === null) continue;
          enqueue(target.normalized, depth + 1);
        } catch (error) {
          console.log(`Skipping pagination link: ${link.href}`);
        }
      }

      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}

/**
 * Downloads one post page and extracts its title, body HTML and author.
 *
 * Waits `SCRAPE_CONFIG.delayBetweenRequests` before the request.
 *
 * @param url - URL of the post page.
 * @returns The extracted post, or `null` when fetching or parsing failed
 *   (the failure is logged).
 */
async function scrapePost(url: string): Promise<BlogPost | null> {
  await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
  console.log(`Processing post: ${url}`);

  try {
    const postHtml = await fetchWithRetry(url);
    const { document: postDoc } = parseHTML(postHtml);

    // `||` is deliberate: an empty title or author list also falls back.
    const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
    const content = postDoc.querySelector('div.entry')?.innerHTML || '';

    // Extract author from categories - this might need adjustment
    const author =
      [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

    return { title, author, content, url };
  } catch (error) {
    if (error instanceof Error) {
      console.error(`Failed to process ${url}: ${error.message}`);
    } else {
      console.error(`Failed to process ${url}:`, error);
    }
    return null;
  }
}

/**
 * Discovers all post URLs under `BLOG_URL` and scrapes them one at a time.
 *
 * @returns The successfully scraped posts; posts that failed are logged and
 *   omitted. Returns `[]` when no post URL was discovered.
 */
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Starting blog post scraping from ${BLOG_URL}...`);

  const postUrls = await discoverAllPostUrls(BLOG_URL);
  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }

  // Process posts sequentially with progress logging
  const posts: BlogPost[] = [];
  const total = postUrls.length;
  let processed = 0;

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await scrapePost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(
        `[${processed}/${total}] Failed to process ${url}:`,
        error instanceof Error ? error.message : error
      );
    }
  }

  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length/postUrls.length)*100)}%)`);
  return posts;
}

/** Scrapes the blog and writes the result to `posts.json` in the working directory. */
async function main(): Promise<void> {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
});