|
|
@ -69,33 +69,15 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Multiple ways to identify post URLs
|
|
|
|
// Strict check for post URL pattern
|
|
|
|
const path = new URL(url).pathname;
|
|
|
|
const path = new URL(url).pathname;
|
|
|
|
const isPostUrl =
|
|
|
|
const isPostUrl = /^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/$/.test(path);
|
|
|
|
// Date-based pattern (YYYY/MM/DD)
|
|
|
|
|
|
|
|
/\d{4}\/\d{2}\/\d{2}\//.test(path) ||
|
|
|
|
|
|
|
|
// Year/month pattern (YYYY/MM)
|
|
|
|
|
|
|
|
/\/\d{4}\/\d{2}\/$/.test(path) ||
|
|
|
|
|
|
|
|
// Contains 'article' or 'post' in path
|
|
|
|
|
|
|
|
/\/article\/|\/post\//.test(path) ||
|
|
|
|
|
|
|
|
// Has post-like HTML structure
|
|
|
|
|
|
|
|
document.querySelector('article.post, div.post, div.entry-content');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (isPostUrl) {
|
|
|
|
if (isPostUrl) {
|
|
|
|
postUrls.add(url);
|
|
|
|
postUrls.add(url);
|
|
|
|
console.log(`✅ Found post URL: ${url}`);
|
|
|
|
console.log(`✅ Found post URL: ${url}`);
|
|
|
|
// Debug why it was identified as post
|
|
|
|
|
|
|
|
if (/\d{4}\/\d{2}\/\d{2}\//.test(path)) console.log(' - Matched date pattern');
|
|
|
|
|
|
|
|
else if (/\/\d{4}\/\d{2}\/$/.test(path)) console.log(' - Matched year/month pattern');
|
|
|
|
|
|
|
|
else if (/\/article\/|\/post\//.test(path)) console.log(' - Matched article/post path');
|
|
|
|
|
|
|
|
else if (document.querySelector('article.post, div.post, div.entry-content')) {
|
|
|
|
|
|
|
|
console.log(' - Matched post content structure');
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
console.log(`🔍 Found internal link: ${url}`);
|
|
|
|
console.log(`🔍 Found internal link: ${url}`);
|
|
|
|
// Debug why it's not considered a post
|
|
|
|
|
|
|
|
if (!path.match(/\d{4}\//)) console.log(' - No date pattern in URL');
|
|
|
|
|
|
|
|
if (!document.querySelector('article.post, div.post')) console.log(' - No post content structure found');
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Find and filter links on page
|
|
|
|
// Find and filter links on page
|
|
|
@ -120,14 +102,10 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Only add certain types of URLs to queue
|
|
|
|
// Only add pagination links and post URLs to queue
|
|
|
|
const path = urlObj.pathname;
|
|
|
|
const path = urlObj.pathname;
|
|
|
|
if (path.startsWith('/tag/') ||
|
|
|
|
if (path.startsWith('/page/') || // Pagination
|
|
|
|
path.startsWith('/category/') ||
|
|
|
|
/^\/\d{4}\/\d{2}\/\d{2}\/[^/]+\/$/.test(path)) { // Post URLs
|
|
|
|
path.startsWith('/page/') ||
|
|
|
|
|
|
|
|
path.startsWith('/author/') ||
|
|
|
|
|
|
|
|
path === '/' ||
|
|
|
|
|
|
|
|
/\d{4}\/\d{2}\/\d{2}/.test(path)) {
|
|
|
|
|
|
|
|
queue.push({url: normalizedUrl, depth: depth + 1});
|
|
|
|
queue.push({url: normalizedUrl, depth: depth + 1});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} catch (error) {
|
|
|
|
} catch (error) {
|
|
|
|