refactor: simplify URL pattern matching for blog posts

main
brobert (aider) 3 months ago
parent a1e8066ea4
commit 976245d84a

@@ -69,33 +69,15 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
continue;
}
// Multiple ways to identify post URLs
// Strict check for post URL pattern
const path = new URL(url).pathname;
const isPostUrl =
// Date-based pattern (YYYY/MM/DD)
/\d{4}\/\d{2}\/\d{2}\//.test(path) ||
// Year/month pattern (YYYY/MM)
/\/\d{4}\/\d{2}\/$/.test(path) ||
// Contains 'article' or 'post' in path
/\/article\/|\/post\//.test(path) ||
// Has post-like HTML structure
document.querySelector('article.post, div.post, div.entry-content');
const isPostUrl = /^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/$/.test(path);
if (isPostUrl) {
postUrls.add(url);
console.log(`✅ Found post URL: ${url}`);
// Debug why it was identified as post
if (/\d{4}\/\d{2}\/\d{2}\//.test(path)) console.log(' - Matched date pattern');
else if (/\/\d{4}\/\d{2}\/$/.test(path)) console.log(' - Matched year/month pattern');
else if (/\/article\/|\/post\//.test(path)) console.log(' - Matched article/post path');
else if (document.querySelector('article.post, div.post, div.entry-content')) {
console.log(' - Matched post content structure');
}
} else {
console.log(`🔍 Found internal link: ${url}`);
// Debug why it's not considered a post
if (!path.match(/\d{4}\//)) console.log(' - No date pattern in URL');
if (!document.querySelector('article.post, div.post')) console.log(' - No post content structure found');
}
// Find and filter links on page
@@ -120,14 +102,10 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
continue;
}
// Only add certain types of URLs to queue
// Only add pagination links and post URLs to queue
const path = urlObj.pathname;
if (path.startsWith('/tag/') ||
path.startsWith('/category/') ||
path.startsWith('/page/') ||
path.startsWith('/author/') ||
path === '/' ||
/\d{4}\/\d{2}\/\d{2}/.test(path)) {
if (path.startsWith('/page/') || // Pagination
/^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/$/.test(path)) { // Post URLs
queue.push({url: normalizedUrl, depth: depth + 1});
}
} catch (error) {

Loading…
Cancel
Save