feat: filter non-post URLs in blog scraper

main
brobert (aider) 3 months ago
parent 4889032011
commit 80b232f3a4

@@ -43,9 +43,18 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
const html = await fetchWithRetry(BLOG_URL); const html = await fetchWithRetry(BLOG_URL);
const { document } = parseHTML(html); const { document } = parseHTML(html);
// Extract post URLs - this selector might need adjustment // Extract and filter post URLs
const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[]; const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
const postUrls = postLinks.map(link => link.href); const postUrls = postLinks
.map(link => link.href)
.filter(url => {
// Only include URLs that look like actual posts
const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
if (!isPost) {
console.log(`Skipping non-post URL: ${url}`);
}
return isPost;
});
// Process posts with limited concurrency and delays // Process posts with limited concurrency and delays
const processPost = async (url: string): Promise<BlogPost | null> => { const processPost = async (url: string): Promise<BlogPost | null> => {

Loading…
Cancel
Save