diff --git a/scrape-blog.ts b/scrape-blog.ts
new file mode 100644
index 0000000..29b1784
--- /dev/null
+++ b/scrape-blog.ts
@@ -0,0 +1,62 @@
+import { parseHTML } from 'linkedom';
+import { writeFile } from 'fs/promises';
+
+interface BlogPost {
+  title: string;
+  author: string;
+  content: string;
+  date?: string;
+  url: string;
+}
+
+const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';
+
+async function fetchBlogPosts(): Promise<BlogPost[]> {
+  console.log(`Fetching blog posts from ${BLOG_URL}...`);
+  const response = await fetch(BLOG_URL);
+  const html = await response.text();
+  const { document } = parseHTML(html);
+
+  // Extract post URLs - this selector might need adjustment
+  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
+  const postUrls = postLinks.map(link => link.href);
+
+  const posts: BlogPost[] = [];
+
+  for (const url of postUrls) {
+    console.log(`Processing post: ${url}`);
+    try {
+      const postResponse = await fetch(url);
+      const postHtml = await postResponse.text();
+      const { document: postDoc } = parseHTML(postHtml);
+
+      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
+      const content = postDoc.querySelector('div.entry')?.innerHTML || '';
+
+      // Extract author from categories - this might need adjustment
+      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
+        .map(el => el.textContent?.trim())
+        .filter(Boolean)
+        .join(', ') || 'Unknown';
+
+      posts.push({
+        title,
+        author,
+        content,
+        url
+      });
+    } catch (error) {
+      console.error(`Failed to process ${url}:`, error);
+    }
+  }
+
+  return posts;
+}
+
+async function main() {
+  const posts = await fetchBlogPosts();
+  await writeFile('posts.json', JSON.stringify(posts, null, 2));
+  console.log(`Saved ${posts.length} posts to posts.json`);
+}
+
+main().catch(console.error);
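
Note: `BlogPost` declares an optional `date` field that the scraper never populates. A minimal sketch of how it could be filled in, assuming the theme renders the post date in a `div.date` element (a guess, like the other selectors flagged above; adjust to the actual markup):

    // Inside the try block, after extracting `content` - `div.date` is an
    // assumed selector, not confirmed against the live theme:
    const date = postDoc.querySelector('div.date')?.textContent?.trim();

    posts.push({
      title,
      author,
      content,
      ...(date ? { date } : {}),  // include date only when the selector matched
      url
    });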
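
To try the script out, something like the following should work, assuming Node 18+ (for the global `fetch`) and `tsx` as the runner - both assumptions, not part of the diff:

    npm install linkedom
    npx tsx scrape-blog.ts

Also worth noting: the script only collects posts linked from the front page of the blog; paginated archive pages are not followed.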