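// Scrape all posts from a WordPress blog into posts.json.
// Assumes Node 18+ (for the global fetch API) and the 'linkedom' package.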
import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';
import { setTimeout } from 'timers/promises';
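
/** A scraped post, as serialized to posts.json. */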
interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string; // optional; not currently populated by the scraper
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';
// Configure scraping behavior
const SCRAPE_CONFIG = {
  delayBetweenRequests: 4000, // 4 seconds between requests
  maxConcurrentRequests: 2,   // currently unused: posts are processed sequentially below
  userAgent: 'Mozilla/5.0 (compatible; BlogScraper/1.0; +https://github.com/your-repo)'
};
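
/**
 * Fetch a URL as text, retrying a few times with an increasing
 * delay between attempts.
 */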
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let i = 0; i < retries; i++) {
    try {
      const response = await fetch(url, {
        headers: { 'User-Agent': SCRAPE_CONFIG.userAgent }
      });

      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      return await response.text();
    } catch (error) {
      if (i === retries - 1) throw error;
      const waitTime = 5000 * (i + 1); // linear backoff: 5s, 10s, 15s, ...
      console.log(`Retry ${i + 1} for ${url}, waiting ${waitTime}ms...`);
      await setTimeout(waitTime);
    }
  }
  throw new Error('Max retries reached');
}
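
/**
 * Crawl the blog breadth-first, following internal links up to maxDepth,
 * and collect URLs that look like individual posts.
 */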
async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  const discoveredUrls = new Set<string>();
  const queue: { url: string; depth: number }[] = [{ url: baseUrl, depth: 0 }];
  const postUrls = new Set<string>();

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  while (queue.length > 0) {
    const { url, depth } = queue.shift()!;

    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`\nProcessing URL (depth ${depth})`);
    console.log(`Queue size: ${queue.length}`);
    console.log(`Discovered URLs: ${discoveredUrls.size}`);
    console.log(`Post URLs found: ${postUrls.size}`);
    console.log(`Current URL: ${url}`);
    discoveredUrls.add(url);
    try {
      const html = await fetchWithRetry(url);
      const { document } = parseHTML(html);

      // Skip pages that are not on the blog's own domain
      const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
      if (!isOurDomain) {
        console.log(`🚫 Skipping external link: ${url}`);
        continue;
      }
      // Multiple ways to identify post URLs
      const path = new URL(url).pathname;
      const isPostUrl =
        // Date-based pattern (YYYY/MM/DD)
        /\d{4}\/\d{2}\/\d{2}\//.test(path) ||
        // Year/month pattern (YYYY/MM)
        /\/\d{4}\/\d{2}\/$/.test(path) ||
        // Contains 'article' or 'post' in the path
        /\/article\/|\/post\//.test(path) ||
        // Has post-like HTML structure
        document.querySelector('article.post, div.post, div.entry-content') !== null;

      if (isPostUrl) {
        postUrls.add(url);
        console.log(`✅ Found post URL: ${url}`);
        // Debug why it was identified as a post
        if (/\d{4}\/\d{2}\/\d{2}\//.test(path)) console.log(' - Matched date pattern');
        else if (/\/\d{4}\/\d{2}\/$/.test(path)) console.log(' - Matched year/month pattern');
        else if (/\/article\/|\/post\//.test(path)) console.log(' - Matched article/post path');
        else if (document.querySelector('article.post, div.post, div.entry-content')) {
          console.log(' - Matched post content structure');
        }
      } else {
        console.log(`🔍 Found internal link: ${url}`);
        // Debug why it is not considered a post
        if (!path.match(/\d{4}\//)) console.log(' - No date pattern in URL');
        if (!document.querySelector('article.post, div.post')) console.log(' - No post content structure found');
      }
      // Find and filter links on the page
      const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
      for (const link of links) {
        const href = link.href;
        if (!href || href.startsWith('#')) continue;

        try {
          // Normalize the URL: resolve against the base, strip hash and query
          const urlObj = new URL(href, baseUrl);
          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;

          urlObj.hash = '';
          urlObj.search = '';
          const normalizedUrl = urlObj.toString().replace(/\/$/, '');

          // Skip if already discovered or already in the queue
          if (discoveredUrls.has(normalizedUrl) ||
              queue.some(item => item.url === normalizedUrl)) {
            continue;
          }

          // Only add certain types of URLs to the queue
          const path = urlObj.pathname;
          if (path.startsWith('/tag/') ||
              path.startsWith('/category/') ||
              path.startsWith('/page/') ||
              path.startsWith('/author/') ||
              path === '/' ||
              /\d{4}\/\d{2}\/\d{2}/.test(path)) {
            queue.push({ url: normalizedUrl, depth: depth + 1 });
          }
        } catch (error) {
          console.log(`Skipping invalid URL: ${href}`);
        }
      }
      // Check for pagination links (stricter matching)
      const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')] as HTMLAnchorElement[];
      for (const link of paginationLinks) {
        try {
          const urlObj = new URL(link.href, baseUrl);
          if (urlObj.hostname !== new URL(baseUrl).hostname) continue;

          urlObj.hash = '';
          urlObj.search = '';
          const normalizedUrl = urlObj.toString().replace(/\/$/, '');

          if (!discoveredUrls.has(normalizedUrl) &&
              !queue.some(item => item.url === normalizedUrl)) {
            queue.push({ url: normalizedUrl, depth: depth + 1 });
          }
        } catch (error) {
          console.log(`Skipping pagination link: ${link.href}`);
        }
      }

      // Be polite: wait before the next request
      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}
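
/**
 * Discover post URLs, then fetch and parse each one sequentially,
 * waiting SCRAPE_CONFIG.delayBetweenRequests before each request.
 */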
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
  const postUrls = await discoverAllPostUrls(BLOG_URL);

  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }

  // Process posts one at a time, waiting before each request
  const processPost = async (url: string): Promise<BlogPost | null> => {
    await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    console.log(`Processing post: ${url}`);
    try {
      const postHtml = await fetchWithRetry(url);
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract the author from category links - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      return {
        title,
        author,
        content,
        url
      };
    } catch (error) {
      if (error instanceof Error) {
        console.error(`Failed to process ${url}: ${error.message}`);
      } else {
        console.error(`Failed to process ${url}:`, error);
      }
      return null;
    }
  };
  // Process posts with progress logging
  const posts: BlogPost[] = [];
  let processed = 0;
  const total = postUrls.length;

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await processPost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length / postUrls.length) * 100)}%)`);
  return posts;
}
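
/** Entry point: scrape all posts and write them to posts.json. */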
async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);