From 30e58286ebb28714a8ec1a99656dcaff5373354d Mon Sep 17 00:00:00 2001
From: "brobert (aider)"
Date: Tue, 1 Apr 2025 15:31:08 +0200
Subject: [PATCH] feat: add recursive URL discovery with improved logging

---
Review notes: discoverAllPostUrls now resolves links against the page they
were found on, tolerates malformed hrefs, de-duplicates at enqueue time,
stays on the start host for every link, and throttles after failures too.

 scrape-blog.ts | 135 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 111 insertions(+), 24 deletions(-)

diff --git a/scrape-blog.ts b/scrape-blog.ts
index 830710b..1bc779d 100644
--- a/scrape-blog.ts
+++ b/scrape-blog.ts
@@ -38,23 +38,99 @@ async function fetchWithRetry(url: string, retries = 3): Promise<string> {
   throw new Error('Max retries reached');
 }
 
-async function fetchBlogPosts(): Promise<BlogPost[]> {
-  console.log(`Fetching blog posts from ${BLOG_URL}...`);
-  const html = await fetchWithRetry(BLOG_URL);
-  const { document } = parseHTML(html);
-
-  // Extract and filter post URLs
-  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
-  const postUrls = postLinks
-    .map(link => link.href)
-    .filter(url => {
-      // Only include URLs that look like actual posts
-      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
-      if (!isPost) {
-        console.log(`Skipping non-post URL: ${url}`);
-      }
-      return isPost;
-    });
+/**
+ * Crawls the site breadth-first from `baseUrl` and collects every same-host
+ * page whose URL contains a `YYYY/MM/DD/` date segment.
+ *
+ * Links are resolved against the page they were found on, fragments are
+ * dropped, and each URL is queued at most once. Pages that fail to fetch or
+ * parse are logged and skipped.
+ *
+ * @param baseUrl - Absolute URL where the crawl starts; only its host is crawled.
+ * @param maxDepth - Maximum number of link hops from `baseUrl` to follow.
+ * @returns The distinct post URLs, in discovery order.
+ */
+async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
+  let startUrl: URL;
+  try {
+    startUrl = new URL(baseUrl);
+  } catch {
+    console.error(`Error processing ${baseUrl}: invalid base URL`);
+    return [];
+  }
+  startUrl.hash = '';
+
+  const siteHost = startUrl.hostname;
+  const postUrlPattern = /\d{4}\/\d{2}\/\d{2}\//;
+  const queuedUrls = new Set<string>();
+  const queue: {url: string; depth: number}[] = [];
+  const postUrls = new Set<string>();
+
+  // Resolves an href against the page it appeared on. Returns null for
+  // fragment-only, malformed, non-HTTP(S) and off-site links.
+  const resolveSiteLink = (href: string | null, pageUrl: string): string | null => {
+    if (!href || href.startsWith('#')) return null;
+    try {
+      const resolved = new URL(href, pageUrl);
+      if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return null;
+      if (resolved.hostname !== siteHost) return null;
+      resolved.hash = ''; // #fragment variants point at the same document
+      return resolved.toString();
+    } catch {
+      return null; // one unparsable href must not abort the rest of the page
+    }
+  };
+
+  // Marks URLs as seen when queued (not when fetched) so the queue never
+  // holds duplicates and never holds entries beyond maxDepth.
+  const enqueue = (url: string, depth: number): void => {
+    if (depth > maxDepth || queuedUrls.has(url)) return;
+    queuedUrls.add(url);
+    queue.push({url, depth});
+  };
+
+  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);
+  enqueue(startUrl.toString(), 0);
+
+  for (let next = queue.shift(); next !== undefined; next = queue.shift()) {
+    const {url, depth} = next;
+    console.log(`Processing URL (depth ${depth}): ${url}`);
+
+    try {
+      const html = await fetchWithRetry(url);
+      const {document} = parseHTML(html);
+
+      if (postUrlPattern.test(url)) {
+        postUrls.add(url);
+        console.log(`Found post URL: ${url}`);
+      }
+
+      // `a[href]` also matches pagination anchors such as `a.next`, so no
+      // separate pagination pass is needed.
+      for (const link of document.querySelectorAll('a[href]')) {
+        const linkUrl = resolveSiteLink(link.getAttribute('href'), url);
+        if (linkUrl !== null) enqueue(linkUrl, depth + 1);
+      }
+    } catch (error) {
+      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
+    } finally {
+      // Throttle after failures too, so a run of errors cannot hammer the server.
+      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
+    }
+  }
+
+  console.log(`Discovered ${postUrls.size} potential post URLs`);
+  return Array.from(postUrls);
+}
+
+async function fetchBlogPosts(): Promise<BlogPost[]> {
+  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
+  const postUrls = await discoverAllPostUrls(BLOG_URL);
+
+  if (postUrls.length === 0) {
+    console.warn('No post URLs discovered!');
+    return [];
+  }
 
   // Process posts with limited concurrency and delays
   const processPost = async (url: string): Promise<BlogPost | null> => {
@@ -90,15 +166,26 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
     }
   };
 
-  // Process posts sequentially with delay
-  const results = [];
-  for (const url of postUrls) { // Process all posts
-    results.push(await processPost(url));
-  }
+  // Process posts with progress logging
+  const posts: BlogPost[] = [];
+  let processed = 0;
+  const total = postUrls.length;
 
-  const posts: BlogPost[] = results.filter((post): post is BlogPost => post !== null);
+  for (const url of postUrls) {
+    processed++;
+    try {
+      console.log(`[${processed}/${total}] Processing post: ${url}`);
+      const post = await processPost(url);
+      if (post) {
+        posts.push(post);
+        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
+      }
+    } catch (error) {
+      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
+    }
+  }
 
-  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
+  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length/postUrls.length)*100)}%)`);
   return posts;
 }
 