diff --git a/scrape-blog.ts b/scrape-blog.ts
index 29b1784..4161dc2 100644
--- a/scrape-blog.ts
+++ b/scrape-blog.ts
@@ -1,5 +1,6 @@
 import { parseHTML } from 'linkedom';
 import { writeFile } from 'fs/promises';
+import { setTimeout } from 'timers/promises';
 
 interface BlogPost {
   title: string;
@@ -11,10 +12,35 @@ interface BlogPost {
 
 const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';
 
+// Configure scraping behavior
+const SCRAPE_CONFIG = {
+  delayBetweenRequests: 2000, // 2 seconds between requests
+  maxConcurrentRequests: 2,
+  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
+};
+
+// Fetch a URL as text, retrying up to `retries` times with a growing wait.
+async function fetchWithRetry(url: string, retries = 3): Promise<string> {
+  for (let i = 0; i < retries; i++) {
+    try {
+      const response = await fetch(url, {
+        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
+      });
+
+      if (!response.ok) throw new Error(`HTTP ${response.status}`);
+      return await response.text();
+    } catch (error) {
+      if (i === retries - 1) throw error;
+      const waitTime = 5000 * (i + 1); // Linear backoff: 5s, 10s, 15s
+      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
+      await setTimeout(waitTime);
+    }
+  }
+  throw new Error('Max retries reached');
+}
+
 async function fetchBlogPosts(): Promise<BlogPost[]> {
   console.log(`Fetching blog posts from ${BLOG_URL}...`);
-  const response = await fetch(BLOG_URL);
-  const html = await response.text();
+  const html = await fetchWithRetry(BLOG_URL);
   const { document } = parseHTML(html);
 
   // Extract post URLs - this selector might need adjustment
@@ -23,11 +49,11 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
 
-  const posts: BlogPost[] = [];
-
-  for (const url of postUrls) {
+  // Stagger each request by its index so fetches are actually spaced out
+  const processPost = async (url: string, index: number): Promise<BlogPost | null> => {
+    await setTimeout(index * SCRAPE_CONFIG.delayBetweenRequests);
     console.log(`Processing post: ${url}`);
+
     try {
-      const postResponse = await fetch(url);
-      const postHtml = await postResponse.text();
+      const postHtml = await fetchWithRetry(url);
       const { document: postDoc } = parseHTML(postHtml);
 
       const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
@@ -39,17 +65,27 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
         .filter(Boolean)
         .join(', ') || 'Unknown';
 
-      posts.push({
+      return {
         title,
         author,
         content,
         url
-      });
+      };
     } catch (error) {
       console.error(`Failed to process ${url}:`, error);
+      return null;
     }
-  }
+  };
+
+  // Failures were already mapped to null inside processPost, so only
+  // fulfilled, non-null results are real posts.
+  const results = await Promise.allSettled(
+    postUrls.slice(0, 10).map((url, i) => processPost(url, i)) // Limit to 10 posts for initial testing
+  );
+
+  const posts = results
+    .filter(result => result.status === 'fulfilled' && result.value !== null)
+    .map(result => (result as PromiseFulfilledResult<BlogPost>).value);
 
   console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
   return posts;
 }