|
|
@ -75,28 +75,60 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<strin
|
|
|
|
console.log(`🚫 Skipping external link: ${url}`);
|
|
|
|
console.log(`🚫 Skipping external link: ${url}`);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Find all links on page
|
|
|
|
// Find and filter links on page
|
|
|
|
const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
|
|
|
|
const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
|
|
|
|
for (const link of links) {
|
|
|
|
for (const link of links) {
|
|
|
|
const href = link.href;
|
|
|
|
const href = link.href;
|
|
|
|
if (!href || href.startsWith('#')) continue;
|
|
|
|
if (!href || href.startsWith('#')) continue;
|
|
|
|
|
|
|
|
|
|
|
|
// Normalize URL and check if it belongs to our domain
|
|
|
|
try {
|
|
|
|
const urlObj = new URL(href, baseUrl);
|
|
|
|
// Normalize URL - remove trailing slashes and query params
|
|
|
|
if (urlObj.hostname === new URL(baseUrl).hostname) {
|
|
|
|
const urlObj = new URL(href, baseUrl);
|
|
|
|
const normalizedUrl = urlObj.toString();
|
|
|
|
if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
|
|
|
|
if (!discoveredUrls.has(normalizedUrl)) {
|
|
|
|
|
|
|
|
|
|
|
|
// Clean up URL
|
|
|
|
|
|
|
|
urlObj.hash = '';
|
|
|
|
|
|
|
|
urlObj.search = '';
|
|
|
|
|
|
|
|
let normalizedUrl = urlObj.toString().replace(/\/$/, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Skip if already discovered or in queue
|
|
|
|
|
|
|
|
if (discoveredUrls.has(normalizedUrl) ||
|
|
|
|
|
|
|
|
queue.some(item => item.url === normalizedUrl)) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Only add certain types of URLs to queue
|
|
|
|
|
|
|
|
const path = urlObj.pathname;
|
|
|
|
|
|
|
|
if (path.startsWith('/tag/') ||
|
|
|
|
|
|
|
|
path.startsWith('/category/') ||
|
|
|
|
|
|
|
|
path.startsWith('/page/') ||
|
|
|
|
|
|
|
|
path.startsWith('/author/') ||
|
|
|
|
|
|
|
|
path === '/' ||
|
|
|
|
|
|
|
|
/\d{4}\/\d{2}\/\d{2}/.test(path)) {
|
|
|
|
queue.push({url: normalizedUrl, depth: depth + 1});
|
|
|
|
queue.push({url: normalizedUrl, depth: depth + 1});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} catch (error) {
|
|
|
|
|
|
|
|
console.log(`Skipping invalid URL: ${href}`);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Check for pagination links
|
|
|
|
// Check for pagination links (more strict matching)
|
|
|
|
const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
|
|
|
|
const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')] as HTMLAnchorElement[];
|
|
|
|
if (nextPageLink?.href) {
|
|
|
|
for (const link of paginationLinks) {
|
|
|
|
const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
|
|
|
|
try {
|
|
|
|
if (!discoveredUrls.has(nextUrl)) {
|
|
|
|
const urlObj = new URL(link.href, baseUrl);
|
|
|
|
queue.push({url: nextUrl, depth: depth + 1});
|
|
|
|
if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
urlObj.hash = '';
|
|
|
|
|
|
|
|
urlObj.search = '';
|
|
|
|
|
|
|
|
const normalizedUrl = urlObj.toString().replace(/\/$/, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!discoveredUrls.has(normalizedUrl) &&
|
|
|
|
|
|
|
|
!queue.some(item => item.url === normalizedUrl)) {
|
|
|
|
|
|
|
|
queue.push({url: normalizedUrl, depth: depth + 1});
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
} catch (error) {
|
|
|
|
|
|
|
|
console.log(`Skipping pagination link: ${link.href}`);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|