feat: add rate limiting and error handling to blog scraper

main
brobert (aider) 3 months ago
parent abf216867f
commit a872a18077

@@ -1,5 +1,6 @@
 import { parseHTML } from 'linkedom';
 import { writeFile } from 'fs/promises';
+import { setTimeout } from 'timers/promises';
 
 interface BlogPost {
   title: string;
@@ -11,10 +12,35 @@ interface BlogPost {
 const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';
 
+// Configure scraping behavior
+const SCRAPE_CONFIG = {
+  delayBetweenRequests: 2000, // 2 seconds between requests
+  maxConcurrentRequests: 2,
+  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
+};
+
+async function fetchWithRetry(url: string, retries = 3): Promise<string> {
+  for (let i = 0; i < retries; i++) {
+    try {
+      const response = await fetch(url, {
+        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
+      });
+      if (!response.ok) throw new Error(`HTTP ${response.status}`);
+      return await response.text();
+    } catch (error) {
+      if (i === retries - 1) throw error;
+      const waitTime = 5000 * (i + 1); // Linearly increasing backoff: 5s, 10s, 15s
+      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
+      await setTimeout(waitTime);
+    }
+  }
+  throw new Error('Max retries reached');
+}
+
 async function fetchBlogPosts(): Promise<BlogPost[]> {
   console.log(`Fetching blog posts from ${BLOG_URL}...`);
-  const response = await fetch(BLOG_URL);
-  const html = await response.text();
+  const html = await fetchWithRetry(BLOG_URL);
   const { document } = parseHTML(html);
 
   // Extract post URLs - this selector might need adjustment
@@ -23,11 +49,13 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
-  const posts: BlogPost[] = [];
-  for (const url of postUrls) {
+  // Process each post with a delay before fetching
+  const processPost = async (url: string): Promise<BlogPost | null> => {
+    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
     console.log(`Processing post: ${url}`);
     try {
-      const postResponse = await fetch(url);
-      const postHtml = await postResponse.text();
+      const postHtml = await fetchWithRetry(url);
       const { document: postDoc } = parseHTML(postHtml);
 
       const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
@@ -39,17 +67,28 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
         .filter(Boolean)
         .join(', ') || 'Unknown';
 
-      posts.push({
+      return {
         title,
         author,
         content,
         url
-      });
+      };
     } catch (error) {
       console.error(`Failed to process ${url}:`, error);
+      return null;
     }
-  }
+  };
+
+  // Run the post processors and keep only the successful results
+  const results = await Promise.allSettled(
+    postUrls.slice(0, 10).map(processPost) // Limit to 10 posts for initial testing
+  );
+
+  const posts = results
+    .filter(result => result.status === 'fulfilled' && result.value !== null)
+    .map(result => (result as PromiseFulfilledResult<BlogPost>).value);
+
+  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
+
   return posts;
 }
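Note on the retries: the waits in fetchWithRetry grow linearly (5s, 10s, 15s). If genuinely exponential backoff is wanted, a small change to the wait calculation would be enough; the lines below are a sketch of one option with jitter, not something this commit contains:

      // Hypothetical alternative inside the catch block: double the wait on
      // every retry and add up to one second of random jitter so parallel
      // retries do not fire in lockstep.
      const waitTime = 5000 * 2 ** i + Math.floor(Math.random() * 1000);
      await setTimeout(waitTime);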
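Note on concurrency: SCRAPE_CONFIG.maxConcurrentRequests is declared but never enforced; Promise.allSettled starts all ten processPost calls at once, so the per-post delay amounts to a single shared 2-second pause. A minimal sketch of one way to honour that setting, assuming the SCRAPE_CONFIG and BlogPost names from this commit (processInBatches itself is illustrative, not part of the diff):

// Sketch: process URLs in batches of maxConcurrentRequests so that at most
// that many requests are in flight at a time. Each processPost call already
// sleeps for delayBetweenRequests before fetching, which paces the batches.
async function processInBatches(
  urls: string[],
  process: (url: string) => Promise<BlogPost | null>
): Promise<BlogPost[]> {
  const collected: BlogPost[] = [];
  for (let i = 0; i < urls.length; i += SCRAPE_CONFIG.maxConcurrentRequests) {
    const batch = urls.slice(i, i + SCRAPE_CONFIG.maxConcurrentRequests);
    const settled = await Promise.allSettled(batch.map(process));
    for (const result of settled) {
      // Keep only posts that were fetched and parsed successfully
      if (result.status === 'fulfilled' && result.value !== null) {
        collected.push(result.value);
      }
    }
  }
  return collected;
}

fetchBlogPosts could then swap the inline Promise.allSettled block for const posts = await processInBatches(postUrls.slice(0, 10), processPost); and keep the same success logging.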
