feat: add initial blog scraper script with post extraction

main
brobert (aider) 3 months ago
parent c73fce9e92
commit abf216867f

@@ -0,0 +1,62 @@
import { parseHTML } from 'linkedom';
import { writeFile } from 'fs/promises';

interface BlogPost {
  title: string;
  author: string;
  content: string;
  date?: string;
  url: string;
}

const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';

async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Fetching blog posts from ${BLOG_URL}...`);
  const response = await fetch(BLOG_URL);
  const html = await response.text();
  const { document } = parseHTML(html);

  // Extract post URLs - this selector might need adjustment
  const postLinks = [...document.querySelectorAll('div.posttitle a')] as HTMLAnchorElement[];
  const postUrls = postLinks.map(link => link.href);

  const posts: BlogPost[] = [];

  for (const url of postUrls) {
    console.log(`Processing post: ${url}`);
    try {
      const postResponse = await fetch(url);
      const postHtml = await postResponse.text();
      const { document: postDoc } = parseHTML(postHtml);

      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author = [...postDoc.querySelectorAll('a[rel="category tag"]')]
        .map(el => el.textContent?.trim())
        .filter(Boolean)
        .join(', ') || 'Unknown';

      posts.push({
        title,
        author,
        content,
        url
      });
    } catch (error) {
      console.error(`Failed to process ${url}:`, error);
    }
  }

  return posts;
}

async function main() {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}

main().catch(console.error);