feat: improve post URL detection with multiple criteria

main
brobert (aider) 3 months ago
parent 92a3852505
commit a1e8066ea4

@ -62,17 +62,40 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
const html = await fetchWithRetry(url);
const {document} = parseHTML(html);
// Check if this is a post URL (must match both domain and date pattern)
// Check if this is a post URL
const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
const isPostUrl = /\d{4}\/\d{2}\/\d{2}\//.test(url);
if (!isOurDomain) {
console.log(`🚫 Skipping external link: ${url}`);
continue;
}
if (isOurDomain && isPostUrl) {
// Multiple ways to identify post URLs
const path = new URL(url).pathname;
const isPostUrl =
// Date-based pattern (YYYY/MM/DD)
/\d{4}\/\d{2}\/\d{2}\//.test(path) ||
// Year/month pattern (YYYY/MM)
/\/\d{4}\/\d{2}\/$/.test(path) ||
// Contains 'article' or 'post' in path
/\/article\/|\/post\//.test(path) ||
// Has post-like HTML structure
document.querySelector('article.post, div.post, div.entry-content');
if (isPostUrl) {
postUrls.add(url);
console.log(`✅ Found post URL: ${url}`);
} else if (isOurDomain) {
console.log(`🔍 Found internal link: ${url}`);
// Debug why it was identified as post
if (/\d{4}\/\d{2}\/\d{2}\//.test(path)) console.log(' - Matched date pattern');
else if (/\/\d{4}\/\d{2}\/$/.test(path)) console.log(' - Matched year/month pattern');
else if (/\/article\/|\/post\//.test(path)) console.log(' - Matched article/post path');
else if (document.querySelector('article.post, div.post, div.entry-content')) {
console.log(' - Matched post content structure');
}
} else {
console.log(`🚫 Skipping external link: ${url}`);
console.log(`🔍 Found internal link: ${url}`);
// Debug why it's not considered a post
if (!path.match(/\d{4}\//)) console.log(' - No date pattern in URL');
if (!document.querySelector('article.post, div.post')) console.log(' - No post content structure found');
}
// Find and filter links on page

Loading…
Cancel
Save