|
|
@ -51,17 +51,28 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
|
|
|
|
if (depth > maxDepth) continue;
|
|
|
|
if (depth > maxDepth) continue;
|
|
|
|
if (discoveredUrls.has(url)) continue;
|
|
|
|
if (discoveredUrls.has(url)) continue;
|
|
|
|
|
|
|
|
|
|
|
|
console.log(`Processing URL (depth ${depth}): ${url}`);
|
|
|
|
console.log(`\nProcessing URL (depth ${depth})`);
|
|
|
|
|
|
|
|
console.log(`Queue size: ${queue.length}`);
|
|
|
|
|
|
|
|
console.log(`Discovered URLs: ${discoveredUrls.size}`);
|
|
|
|
|
|
|
|
console.log(`Post URLs found: ${postUrls.size}`);
|
|
|
|
|
|
|
|
console.log(`Current URL: ${url}`);
|
|
|
|
discoveredUrls.add(url);
|
|
|
|
discoveredUrls.add(url);
|
|
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
const html = await fetchWithRetry(url);
|
|
|
|
const html = await fetchWithRetry(url);
|
|
|
|
const {document} = parseHTML(html);
|
|
|
|
const {document} = parseHTML(html);
|
|
|
|
|
|
|
|
|
|
|
|
// Check if this is a post URL
|
|
|
|
// Check if this is a post URL (must match both domain and date pattern)
|
|
|
|
if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) {
|
|
|
|
const isOurDomain = new URL(url).hostname === new URL(baseUrl).hostname;
|
|
|
|
|
|
|
|
const isPostUrl = /\d{4}\/\d{2}\/\d{2}\//.test(url);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (isOurDomain && isPostUrl) {
|
|
|
|
postUrls.add(url);
|
|
|
|
postUrls.add(url);
|
|
|
|
console.log(`Found post URL: ${url}`);
|
|
|
|
console.log(`✅ Found post URL: ${url}`);
|
|
|
|
|
|
|
|
} else if (isOurDomain) {
|
|
|
|
|
|
|
|
console.log(`🔍 Found internal link: ${url}`);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
console.log(`🚫 Skipping external link: ${url}`);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Find all links on page
|
|
|
|
// Find all links on page
|
|
|
|