@@ -38,23 +38,78 @@ async function fetchWithRetry(url: string, retries = 3): Promise<string> {
   throw new Error('Max retries reached');
 }
 
-async function fetchBlogPosts(): Promise<BlogPost[]> {
-  console.log(`Fetching blog posts from ${BLOG_URL}...`);
-  const html = await fetchWithRetry(BLOG_URL);
-  const { document } = parseHTML(html);
-
-  // Extract and filter post URLs
-  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
-  const postUrls = postLinks
-    .map(link => link.href)
-    .filter(url => {
-      // Only include URLs that look like actual posts
-      const isPost = /\/\d{4}\/\d{2}\/\d{2}\//.test(url);
-      if (!isPost) {
-        console.log(`Skipping non-post URL: ${url}`);
-      }
-      return isPost;
-    });
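+// Breadth-first crawl: follows same-host links (including 'next' pagination links) up to maxDepth, collecting URLs that match the dated post pattern.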
+async function discoverAllPostUrls(baseUrl: string, maxDepth = 3): Promise<string[]> {
+  const discoveredUrls = new Set<string>();
+  const queue: {url: string; depth: number}[] = [{url: baseUrl, depth: 0}];
+  const postUrls = new Set<string>();
+
+  console.log(`Starting URL discovery from ${baseUrl} with max depth ${maxDepth}`);
+
+  while (queue.length > 0) {
+    const {url, depth} = queue.shift()!;
+
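+    // Drop URLs beyond the depth limit; dedupe happens at dequeue time, since the queue may hold repeats.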
+    if (depth > maxDepth) continue;
+    if (discoveredUrls.has(url)) continue;
+
+    console.log(`Processing URL (depth ${depth}): ${url}`);
+    discoveredUrls.add(url);
+
+    try {
+      const html = await fetchWithRetry(url);
+      const {document} = parseHTML(html);
+
+      // Check if this is a post URL
+      if (/\d{4}\/\d{2}\/\d{2}\//.test(url)) {
+        postUrls.add(url);
+        console.log(`Found post URL: ${url}`);
+      }
+
+      // Find all links on page
+      const links = [...document.querySelectorAll('a[href]')] as HTMLAnchorElement[];
+      for (const link of links) {
+        const href = link.href;
+        if (!href || href.startsWith('#')) continue;
+
+        // Normalize URL and check if it belongs to our domain
+        const urlObj = new URL(href, baseUrl);
+        if (urlObj.hostname === new URL(baseUrl).hostname) {
+          const normalizedUrl = urlObj.toString();
+          if (!discoveredUrls.has(normalizedUrl)) {
+            queue.push({url: normalizedUrl, depth: depth + 1});
+          }
+        }
+      }
+
+      // Check for pagination links
+      const nextPageLink = document.querySelector('a.next') as HTMLAnchorElement;
+      if (nextPageLink?.href) {
+        const nextUrl = new URL(nextPageLink.href, baseUrl).toString();
+        if (!discoveredUrls.has(nextUrl)) {
+          queue.push({url: nextUrl, depth: depth + 1});
+        }
+      }
+
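+      // Politeness delay between requests; assumes setTimeout is the promise-based version from 'node:timers/promises'.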
+      await setTimeout(SCRAPE_CONFIG.delayBetweenRequests);
+    } catch (error) {
+      console.error(`Error processing ${url}:`, error instanceof Error ? error.message : error);
+    }
+  }
+
+  console.log(`Discovered ${postUrls.size} potential post URLs`);
+  return Array.from(postUrls);
+}
+
+async function fetchBlogPosts(): Promise<BlogPost[]> {
+  console.log(`Starting blog post scraping from ${BLOG_URL}...`);
+  const postUrls = await discoverAllPostUrls(BLOG_URL);
+
+  if (postUrls.length === 0) {
+    console.warn('No post URLs discovered!');
+    return [];
+  }
 
   // Process posts with limited concurrency and delays
   const processPost = async (url: string): Promise<BlogPost | null> => {
@@ -90,15 +145,27 @@ async function fetchBlogPosts(): Promise<BlogPost[]> {
     }
   };
 
-  // Process posts sequentially with delay
-  const results = [];
-  for (const url of postUrls) { // Process all posts
-    results.push(await processPost(url));
-  }
-
-  const posts: BlogPost[] = results.filter((post): post is BlogPost => post !== null);
-
-  console.log(`Successfully processed ${posts.length}/${postUrls.length} posts`);
+  // Process posts with progress logging
+  const posts: BlogPost[] = [];
+  let processed = 0;
+  const total = postUrls.length;
+
+  for (const url of postUrls) {
+    processed++;
+    try {
+      console.log(`[${processed}/${total}] Processing post: ${url}`);
+      const post = await processPost(url);
+      if (post) {
+        posts.push(post);
+        console.log(`[${processed}/${total}] Successfully processed: ${post.title}`);
+      }
+    } catch (error) {
+      console.error(`[${processed}/${total}] Failed to process ${url}:`, error instanceof Error ? error.message : error);
+    }
+  }
+
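+  // The early return above guarantees postUrls is non-empty, so the success-rate division is safe.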
+  console.log(`Finished processing. Success rate: ${posts.length}/${postUrls.length} (${Math.round((posts.length/postUrls.length)*100)}%)`);
   return posts;
 }
 