feat: add recursive URL discovery with improved logging

main
brobert (aider) 3 months ago
parent b43f7db8bb
commit 30e58286eb

@@ -38,23 +38,75 @@ async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  throw new Error('Max retries reached');
}
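/**
 * Breadth-first crawl of same-host links starting from baseUrl, up to maxDepth.
 * URLs whose path contains a /YYYY/MM/DD/ date segment are collected as posts,
 * "a.next" pagination links are followed, and a delay is inserted between requests.
 */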
async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
  const discoveredUrls = new Set<string>();
  const queue: {url: string; depth: number}[] = [{url: baseUrl, depth: 0}];
  const postUrls = new Set<string>();

  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);

  while (queue.length > 0) {
    const {url, depth} = queue.shift()!;
    if (depth > maxDepth) continue;
    if (discoveredUrls.has(url)) continue;

    console.log(`Processing URL (depth ${depth}): ${url}`);
    discoveredUrls.add(url);

    try {
      const html = await fetchWithRetry(url);
      const {document} = parseHTML(html);

      // Check if this is a post URL
      if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) {
        postUrls.add(url);
        console.log(`Found post URL: ${url}`);
      }

      // Find all links on page
      const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
      for (const link of links) {
        const href = link.href;
        if (!href || href.startsWith('#')) continue;

        // Normalize URL and check if it belongs to our domain
        const urlObj = new URL(href, baseUrl);
        if (urlObj.hostname === new URL(baseUrl).hostname) {
          const normalizedUrl = urlObj.toString();
          if (!discoveredUrls.has(normalizedUrl)) {
            queue.push({url: normalizedUrl, depth: depth + 1});
          }
        }
      }

      // Check for pagination links
      const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
      if (nextPageLink?.href) {
        const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
        if (!discoveredUrls.has(nextUrl)) {
          queue.push({url: nextUrl, depth: depth + 1});
        }
      }

      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
    } catch (error) {
      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Discovered ${postUrls.size} potential post URLs`);
  return Array.from(postUrls);
}
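/**
 * Fetches all blog posts: discovers candidate post URLs, then processes each
 * URL sequentially with per-post progress logging.
 */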
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Fetching blog posts from ${BLOG_URL}...`);
  const html = await fetchWithRetry(BLOG_URL);
  const { document } = parseHTML(html);

  // Extract and filter post URLs
  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
  const postUrls = postLinks
    .map(link => link.href)
    .filter(url => {
      // Only include URLs that look like actual posts
      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
      if (!isPost) {
        console.log(`Skipping non-post URL: ${url}`);
      }
      return isPost;
    });

  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
  const postUrls = await discoverAllPostUrls(BLOG_URL);

  if (postUrls.length === 0) {
    console.warn('No post URLs discovered!');
    return [];
  }

  // Process posts with limited concurrency and delays
  const processPost = async (url: string): Promise<BlogPost | null> => {
@@ -90,15 +142,26 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
    }
  };

  // Process posts sequentially with delay
  const results = [];
  for (const url of postUrls) { // Process all posts
    results.push(await processPost(url));
  }

  // Process posts with progress logging
  const posts: BlogPost[] = [];
  let processed = 0;
  const total = postUrls.length;

  const posts: BlogPost[] = results.filter((post): post is BlogPost => post !== null);

  for (const url of postUrls) {
    processed++;
    try {
      console.log(`[${processed}/${total}] Processing post: ${url}`);
      const post = await processPost(url);
      if (post) {
        posts.push(post);
        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
      }
    } catch (error) {
      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
    }
  }

  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length/postUrls.length)*100)}%)`);

  return posts;
}
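The post-URL check in both versions of the code is a date-path heuristic; the new discovery code drops the leading \/ that the old filter required. A standalone sketch of what the pattern accepts, with illustrative sample URLs:

// Same date-path heuristic as above: URLs containing /YYYY/MM/DD/ are treated as posts.
const postPattern = /\/\d{4}\/\d{2}\/\d{2}\//;

const samples = [
  'https://example.com/2023/05/14/some-post-slug/', // matches: contains /2023/05/14/
  'https://example.com/category/announcements/',    // no match: no date segment in the path
  'https://example.com/page/2/',                    // no match: pagination index, not a dated post
];

for (const url of samples) {
  console.log(`${postPattern.test(url) ? 'post' : 'skip'}  ${url}`);
}

For context, a driver for the updated fetchBlogPosts might look like the sketch below; the './scrape-blog' module path and the fetchBlogPosts export are assumptions for illustration, since the diff does not show the module's exports.

// Hypothetical entry point; the import path and export are assumed, not part of this commit.
import { fetchBlogPosts } from './scrape-blog';

async function main(): Promise<void> {
  const posts = await fetchBlogPosts();
  console.log(`Scraped ${posts.length} posts`);
  for (const post of posts.slice(0, 5)) {
    console.log(`- ${post.title}`);
  }
}

main().catch(err => {
  console.error('Scrape failed:', err);
  process.exit(1);
});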
