import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';

interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string;
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 4000, // 4 seconds between requests
  maxConcurrentRequests: 2,
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};
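
// Note: maxConcurrentRequests is defined above but never used; posts are fetched
// strictly one at a time further down. A minimal sketch of how the limit could be
// applied with a simple worker pool (mapWithConcurrency and its name are not part
// of the original script), e.g. inside fetchBlogPosts:
//   await mapWithConcurrency(postUrls, SCRAPE_CONFIG.maxConcurrentRequests, processPost)
async function mapWithConcurrency<T, R>(
  items: T[],
  limit: number,
  fn: (item: T) => Promise<R>
): Promise<R[]> {
  const results: R[] = new Array(items.length);
  let next = 0;
  // Spawn up to `limit` workers that each pull the next index from a shared counter.
  const workers = Array.from({ length: Math.min(limit, items.length) }, async () => {
    while (next < items.length) {
      const i = next++;
      results[i] = await fn(items[i]);
    }
  });
  await Promise.all(workers);
  return results;
}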

async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });

      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * (i + 1); // Linear backoff: 5s, 10s, 15s...
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}

async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  const discoveredUrls = new Set<string>();
  const queue: {url: string; depth: number}[] = [{url: baseUrl, depth: 0}];
  const postUrls = new Set<string>();

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  while (queue.length > 0) {
    const {url, depth} = queue.shift()!;

    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`Processing URL (depth ${depth}): ${url}`);
    discoveredUrls.add(url);

    try {
      const html = await fetchWithRetry(url);
      const {document} = parseHTML(html);

      // Check if this is a post URL
      if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) {
        postUrls.add(url);
        console.log(`Found post URL: ${url}`);
      }
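
      // Illustrative examples (hypothetical URLs): a date-based permalink such as
      //   https://cronicasperiodisticas.wordpress.com/2011/03/15/some-post-slug/
      // matches the pattern above, while an archive page such as
      //   https://cronicasperiodisticas.wordpress.com/category/autores/
      // does not, so only individual posts land in postUrls.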

      // Find all links on page
      const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
      for (const link of links) {
        const href = link.href;
        if (!href || href.startsWith('#')) continue;

        // Normalize URL and check if it belongs to our domain
        const urlObj = new URL(href, baseUrl);
        if (urlObj.hostname === new URL(baseUrl).hostname) {
          const normalizedUrl = urlObj.toString();
          if (!discoveredUrls.has(normalizedUrl)) {
            queue.push({url: normalizedUrl, depth: depth + 1});
          }
        }
      }

      // Check for pagination links
      const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
      if (nextPageLink?.href) {
        const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
        if (!discoveredUrls.has(nextUrl)) {
          queue.push({url: nextUrl, depth: depth + 1});
        }
      }

      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}

async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
  const postUrls = await discoverAllPostUrls(BLOG_URL);

  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }

  // Fetch and parse a single post, waiting between requests
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);

    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      return {
        title,
        author,
        content,
        url
      };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };

  // Process posts sequentially with progress logging
  const posts: BlogPost[] = [];
  let processed = 0;
  const total = postUrls.length;

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await processPost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length / postUrls.length) * 100)}%)`);
  return posts;
}

async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);
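
// Assumed usage (not part of the original file): run with a TypeScript runner,
// e.g. `npx tsx scrape.ts` (filename assumed). Requires Node.js 18+ for the
// global fetch API and the linkedom package installed.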