From 80b232f3a47ee0c4b610e8650bbff3ed7f2d3141 Mon Sep 17 00:00:00 2001 From: "brobert (aider)" Date: Tue, 1 Apr 2025 14:56:38 +0200 Subject: [PATCH] feat: filter non-post URLs in blog scraper --- scrape-blog.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scrape-blog.ts b/scrape-blog.ts index b36d0bb..2707a2f 100644 --- a/scrape-blog.ts +++ b/scrape-blog.ts @@ -43,9 +43,18 @@ async function fetchBlogPosts(): Promise { const html = await fetchWithRetry(BLOG_URL); const { document } = parseHTML(html); - // Extract post URLs - this selector might need adjustment + // Extract and filter post URLs const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[]; - const postUrls = postLinks.map(link => link.href); + const postUrls = postLinks + .map(link => link.href) + .filter(url => { + // Only include URLs that look like actual posts + const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url); + if (!isPost) { + console.log(`Skipping non-post URL: ${url}`); + } + return isPost; + }); // Process posts with limited concurrency and delays const processPost = async (url: string): Promise => {