import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';

interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string; // optional; not populated by the current selectors
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 2000, // 2 seconds between requests
  maxConcurrentRequests: 2,   // not used yet; posts are fetched sequentially below
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};

async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });

      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * (i + 1); // Linear backoff: 5s, then 10s, ...
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}

async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Fetching blog posts from ${BLOG_URL}...`);
  const html = await fetchWithRetry(BLOG_URL);
  const { document } = parseHTML(html);

  // Extract and filter post URLs
  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
  const postUrls = postLinks
    .map(link => link.href)
    .filter(url => {
      // Only include URLs that look like actual posts
      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
      if (!isPost) {
        console.log(`Skipping non-post URL: ${url}`);
      }
      return isPost;
    });

  // Fetch and parse a single post, waiting before each request to stay polite
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    console.log(`Processing post: ${url}`);

    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      return {
        title,
        author,
        content,
        url
      };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };

  // Process all posts sequentially, with a delay before each request
  const results: (BlogPost | null)[] = [];
  for (const url of postUrls) {
    results.push(await processPost(url));
  }

  const posts: BlogPost[] = results.filter((post): post is BlogPost => post !== null);

  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
  return posts;
}

async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);