import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';
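
/**
 * Scraper for the "Crónicas Periodísticas" WordPress blog.
 *
 * Flow: crawl the site breadth-first to discover post permalinks
 * (discoverAllPostUrls), fetch and parse each post (fetchBlogPosts),
 * then write the results to posts.json.
 *
 * Assumes Node 18+ (for the global fetch API) with the `linkedom`
 * package installed for HTML parsing.
 */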
interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string;
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 4000, // 4 seconds between requests
  maxConcurrentRequests: 2, // currently unused: posts are processed sequentially
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};

async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * 2 ** i; // Exponential backoff: 5s, 10s, 20s
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}
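
/**
 * Breadth-first crawl of the blog, starting at `baseUrl`.
 *
 * Internal links are queued up to `maxDepth` levels deep; URLs matching
 * the WordPress permalink pattern /YYYY/MM/DD/slug/ are collected as
 * posts, while tag, category, and pagination pages are only followed
 * for further discovery.
 */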
async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  const discoveredUrls = new Set<string>();
  const queue: { url: string; depth: number }[] = [{ url: baseUrl, depth: 0 }];
  const postUrls = new Set<string>();

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  while (queue.length > 0) {
    const { url, depth } = queue.shift()!;
    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`\nProcessing URL (depth ${depth})`);
    console.log(`Queue size: ${queue.length}`);
    console.log(`Discovered URLs: ${discoveredUrls.size}`);
    console.log(`Post URLs found: ${postUrls.size}`);
    console.log(`Current URL: ${url}`);

    discoveredUrls.add(url);
    try {
      const html = await fetchWithRetry(url);
      const { document } = parseHTML(html);

      // Check if this is a post URL
      const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
      if (!isOurDomain) {
        console.log(`🚫 Skipping external link: ${url}`);
        continue;
      }

      // Strict check for the post URL pattern /YYYY/MM/DD/slug/; the trailing
      // slash is optional because queued URLs are normalized without one
      const path = new URL(url).pathname;
      const isPostUrl = /^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/?$/.test(path);

      if (isPostUrl) {
        postUrls.add(url);
        console.log(`✅ Found post URL: ${url}`);
      } else {
        // Log what type of internal link we found
        if (path.startsWith('/tag/')) {
          console.log(`🏷️ Found tag page: ${url}`);
        } else if (path.startsWith('/category/')) {
          console.log(`🗂️ Found category page: ${url}`);
        } else if (path.startsWith('/page/')) {
          console.log(`📄 Found pagination page: ${url}`);
        } else {
          console.log(`🔍 Found internal link: ${url}`);
        }
      }
      // Find and filter links on the page
      const links = [...document.querySelectorAll('a[href]')];
      for (const link of links) {
        // Read the raw attribute; relative URLs are resolved against baseUrl below
        const href = link.getAttribute('href');
        if (!href || href.startsWith('#')) continue;
        try {
          // Normalize URL - drop hash, query params and trailing slash
          const urlObj = new URL(href, baseUrl);
          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
          urlObj.hash = '';
          urlObj.search = '';
          const normalizedUrl = urlObj.toString().replace(/\/$/, '');

          // Skip if already discovered or in queue
          if (discoveredUrls.has(normalizedUrl) ||
              queue.some(item => item.url === normalizedUrl)) {
            continue;
          }

          // Add pagination, tag, category and post URLs to the queue
          const path = urlObj.pathname;
          if (path.startsWith('/page/') || // Pagination
              path.startsWith('/tag/') || // Tag pages
              path.startsWith('/category/') || // Category pages
              /^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/?$/.test(path)) { // Post URLs
            queue.push({ url: normalizedUrl, depth: depth + 1 });
          }
        } catch (error) {
          console.log(`Skipping invalid URL: ${href}`);
        }
      }
      // Check for pagination links (more strict matching)
      const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')];
      for (const link of paginationLinks) {
        const href = link.getAttribute('href');
        if (!href) continue;
        try {
          const urlObj = new URL(href, baseUrl);
          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
          urlObj.hash = '';
          urlObj.search = '';
          const normalizedUrl = urlObj.toString().replace(/\/$/, '');
          if (!discoveredUrls.has(normalizedUrl) &&
              !queue.some(item => item.url === normalizedUrl)) {
            queue.push({ url: normalizedUrl, depth: depth + 1 });
          }
        } catch (error) {
          console.log(`Skipping pagination link: ${href}`);
        }
      }
      // Be polite: pause between page fetches
      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}
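
/**
 * Fetches every discovered post and extracts title, author and HTML body.
 *
 * The selectors used below (`h2.pagetitle`, `div.entry`,
 * `a[rel="category tag"]`) match this blog's WordPress theme and may
 * need adjusting for other themes.
 */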
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
  const postUrls = await discoverAllPostUrls(BLOG_URL);

  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }
  // Process posts sequentially, pausing between requests
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      return { title, author, content, url };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };
  // Process posts with progress logging
  const posts: BlogPost[] = [];
  let processed = 0;
  const total = postUrls.length;

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await processPost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length / postUrls.length) * 100)}%)`);
  return posts;
}

async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);