From 80b232f3a47ee0c4b610e8650bbff3ed7f2d3141 Mon Sep 17 00:00:00 2001
From: "brobert (aider)" <borja@brobert.net>
Date: Tue, 1 Apr 2025 14:56:38 +0200
Subject: [PATCH] feat: filter non-post URLs in blog scraper

---
 scrape-blog.ts | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/scrape-blog.ts b/scrape-blog.ts
index b36d0bb..2707a2f 100644
--- a/scrape-blog.ts
+++ b/scrape-blog.ts
@@ -43,9 +43,18 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
   const html = await fetchWithRetry(BLOG_URL);
   const { document } = parseHTML(html);
 
-  // Extract post URLs - this selector might need adjustment
+  // Extract and filter post URLs
   const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
-  const postUrls = postLinks.map(link => link.href);
+  const postUrls = postLinks
+    .map(link => link.href)
+    .filter(url => {
+      // Keep only URLs containing a /YYYY/MM/DD/ date segment, treated here as the mark of an actual post
+      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
+      if (!isPost) {
+        console.log(`Skipping non-post URL: ${url}`);
+      }
+      return isPost;
+    });
 
   // Process posts with limited concurrency and delays
   const processPost = async (url: string): Promise<BlogPost | null> => {