fix: improve URL deduplication and filtering in blog scraper

main
brobert (aider) 3 months ago
parent 53b0551942
commit 92a3852505

@@ -75,28 +75,60 @@ async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string
console.log(`🚫 Skipping external link: ${url}`);
}
// Find all links on page
// Find and filter links on page
const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
for (const link of links) {
const href = link.href;
if (!href || href.startsWith('#')) continue;
// Normalize URL and check if it belongs to our domain
const urlObj = new URL(href, baseUrl);
if (urlObj.hostname === new URL(baseUrl).hostname) {
const normalizedUrl = urlObj.toString();
if (!discoveredUrls.has(normalizedUrl)) {
try {
// Normalize URL - remove trailing slashes and query params
const urlObj = new URL(href, baseUrl);
if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
// Clean up URL
urlObj.hash = '';
urlObj.search = '';
let normalizedUrl = urlObj.toString().replace(/\/$/, '');
// Skip if already discovered or in queue
if (discoveredUrls.has(normalizedUrl) ||
queue.some(item => item.url === normalizedUrl)) {
continue;
}
// Only add certain types of URLs to queue
const path = urlObj.pathname;
if (path.startsWith('/tag/') ||
path.startsWith('/category/') ||
path.startsWith('/page/') ||
path.startsWith('/author/') ||
path === '/' ||
/\d{4}\/\d{2}\/\d{2}/.test(path)) {
queue.push({url: normalizedUrl, depth: depth + 1});
}
} catch (error) {
console.log(`Skipping invalid URL: ${href}`);
}
}
// Check for pagination links
const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
if (nextPageLink?.href) {
const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
if (!discoveredUrls.has(nextUrl)) {
queue.push({url: nextUrl, depth: depth + 1});
// Check for pagination links (more strict matching)
const paginationLinks = [...document.querySelectorAll('a.page-numbers, a.next, a.prev')] as HTMLAnchorElement[];
for (const link of paginationLinks) {
try {
const urlObj = new URL(link.href, baseUrl);
if (urlObj.hostname !== new URL(baseUrl).hostname) continue;
urlObj.hash = '';
urlObj.search = '';
const normalizedUrl = urlObj.toString().replace(/\/$/, '');
if (!discoveredUrls.has(normalizedUrl) &&
!queue.some(item => item.url === normalizedUrl)) {
queue.push({url: normalizedUrl, depth: depth + 1});
}
} catch (error) {
console.log(`Skipping pagination link: ${link.href}`);
}
}

Loading…
Cancel
Save