diff --git a/scrape-blog.ts b/scrape-blog.ts index b36d0bb..2707a2f 100644 --- a/scrape-blog.ts +++ b/scrape-blog.ts @@ -43,9 +43,18 @@ async function fetchBlogPosts(): Promise { const html = await fetchWithRetry(BLOG_URL); const { document } = parseHTML(html); - // Extract post URLs - this selector might need adjustment + // Extract and filter post URLs const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[]; - const postUrls = postLinks.map(link => link.href); + const postUrls = postLinks + .map(link => link.href) + .filter(url => { + // Only include URLs that look like actual posts + const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url); + if (!isPost) { + console.log(`Skipping non-post URL: ${url}`); + } + return isPost; + }); // Process posts with limited concurrency and delays const processPost = async (url: string): Promise => {