|
|
@@ -62,17 +62,40 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
|
|
|
|
const html = await fetchWithRetry(url);
|
|
|
|
const html = await fetchWithRetry(url);
|
|
|
|
const {document} = parseHTML(html);
|
|
|
|
const {document} = parseHTML(html);
|
|
|
|
|
|
|
|
|
|
|
|
// Check if this is a post URL (must match both domain and date pattern)
|
|
|
|
// Check if this is a post URL
|
|
|
|
const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
|
|
|
|
const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
|
|
|
|
const isPostUrl = /\d{4}\/\d{2}\/\d{2}\//.test(url);
|
|
|
|
if (!isOurDomain) {
|
|
|
|
|
|
|
|
console.log(`🚫 Skipping external link: ${url}`);
|
|
|
|
if (isOurDomain && isPostUrl) {
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Multiple ways to identify post URLs
|
|
|
|
|
|
|
|
const path = new URL(url).pathname;
|
|
|
|
|
|
|
|
const isPostUrl =
|
|
|
|
|
|
|
|
// Date-based pattern (YYYY/MM/DD)
|
|
|
|
|
|
|
|
/\d{4}\/\d{2}\/\d{2}\//.test(path) ||
|
|
|
|
|
|
|
|
// Year/month pattern (YYYY/MM)
|
|
|
|
|
|
|
|
/\/\d{4}\/\d{2}\/$/.test(path) ||
|
|
|
|
|
|
|
|
// Contains 'article' or 'post' in path
|
|
|
|
|
|
|
|
/\/article\/|\/post\//.test(path) ||
|
|
|
|
|
|
|
|
// Has post-like HTML structure
|
|
|
|
|
|
|
|
document.querySelector('article.post, div.post, div.entry-content');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (isPostUrl) {
|
|
|
|
postUrls.add(url);
|
|
|
|
postUrls.add(url);
|
|
|
|
console.log(`✅ Found post URL: ${url}`);
|
|
|
|
console.log(`✅ Found post URL: ${url}`);
|
|
|
|
} else if (isOurDomain) {
|
|
|
|
// Debug why it was identified as a post
|
|
|
|
console.log(`🔍 Found internal link: ${url}`);
|
|
|
|
if (/\d{4}\/\d{2}\/\d{2}\//.test(path)) console.log(' - Matched date pattern');
|
|
|
|
|
|
|
|
else if (/\/\d{4}\/\d{2}\/$/.test(path)) console.log(' - Matched year/month pattern');
|
|
|
|
|
|
|
|
else if (/\/article\/|\/post\//.test(path)) console.log(' - Matched article/post path');
|
|
|
|
|
|
|
|
else if (document.querySelector('article.post, div.post, div.entry-content')) {
|
|
|
|
|
|
|
|
console.log(' - Matched post content structure');
|
|
|
|
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
console.log(`🚫 Skipping external link: ${url}`);
|
|
|
|
console.log(`🔍 Found internal link: ${url}`);
|
|
|
|
|
|
|
|
// Debug why it's not considered a post
|
|
|
|
|
|
|
|
if (!path.match(/\d{4}\//)) console.log(' - No date pattern in URL');
|
|
|
|
|
|
|
|
if (!document.querySelector('article.post, div.post')) console.log(' - No post content structure found');
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Find and filter links on page
|
|
|
|
// Find and filter links on page
|
|
|
|