feat: add initial blog scraper script with post extraction
parent
c73fce9e92
commit
abf216867f
@ -0,0 +1,62 @@
|
|||||||
|
import { parseHTML } from 'linkedom';
|
||||||
|
import { writeFile } from 'fs/promises';
|
||||||
|
|
||||||
|
/**
 * A single scraped blog post, as serialized to the output JSON file.
 */
interface BlogPost {
  /** Post headline as plain text. */
  title: string;
  /** Author label as plain text. */
  author: string;
  /** Post body; the scraper stores the raw inner HTML of the entry container. */
  content: string;
  /** Publication date, when known. Optional; no format is enforced here. */
  date?: string;
  /** URL the post was fetched from. */
  url: string;
}
|
||||||
|
|
||||||
|
// Root URL of the blog; fetched as the index page from which post links are collected.
const BLOG_URL = 'https://cronicasperiodisticas.wordpress.com';
|
||||||
|
|
||||||
|
/**
 * Scrapes the blog index page and then every post it links to.
 *
 * Post links are read from `div.posttitle a` on the index page. For each post
 * page the title comes from `h2.pagetitle`, the body is the inner HTML of
 * `div.entry`, and the author is assembled from the text of the post's
 * category links (`a[rel="category tag"]`).
 *
 * A post that cannot be downloaded (network error or non-2xx status) or
 * processed is logged and skipped; the remaining posts are still returned.
 *
 * @returns The successfully scraped posts, in the order their links appear on
 *   the index page, without duplicates.
 * @throws If the index page request fails or answers with a non-2xx status.
 */
async function fetchBlogPosts(): Promise<BlogPost[]> {
  console.log(`Fetching blog posts from ${BLOG_URL}...`);
  const response = await fetch(BLOG_URL);
  // fetch() only rejects on network failure; an HTTP error page must not be
  // parsed as if it were the blog index.
  if (!response.ok) {
    throw new Error(
      `Failed to fetch ${BLOG_URL}: HTTP ${response.status} ${response.statusText}`,
    );
  }
  const { document } = parseHTML(await response.text());

  // Extract post URLs - this selector might need adjustment.
  // A Set drops links that appear more than once while keeping first-seen order.
  const postUrls = new Set<string>();
  for (const link of [...document.querySelectorAll('div.posttitle a')]) {
    const href = link.getAttribute('href');
    if (!href) {
      continue;
    }
    try {
      // Resolve against the blog root so relative links become fetchable.
      postUrls.add(new URL(href, BLOG_URL).href);
    } catch {
      console.warn(`Skipping malformed post link: ${href}`);
    }
  }

  const posts: BlogPost[] = [];

  // Posts are fetched one at a time, so at most one request is in flight.
  for (const url of postUrls) {
    console.log(`Processing post: ${url}`);
    try {
      const postResponse = await fetch(url);
      if (!postResponse.ok) {
        throw new Error(`HTTP ${postResponse.status} ${postResponse.statusText}`);
      }
      const { document: postDoc } = parseHTML(await postResponse.text());

      // `||` (not `??`) on purpose: an empty title must also fall back.
      const title = postDoc.querySelector('h2.pagetitle')?.textContent?.trim() || 'Untitled';
      const content = postDoc.querySelector('div.entry')?.innerHTML || '';

      // Extract author from categories - this might need adjustment
      const author =
        [...postDoc.querySelectorAll('a[rel="category tag"]')]
          .map(el => el.textContent?.trim())
          .filter(Boolean)
          .join(', ') || 'Unknown';

      posts.push({ title, author, content, url });
    } catch (error) {
      console.error(`Failed to process ${url}:`, error);
    }
  }

  return posts;
}
|
||||||
|
|
||||||
|
/**
 * Script entry point: scrapes the blog and writes the collected posts to
 * `posts.json` (relative to the current working directory) as JSON indented
 * with two spaces, then logs how many posts were saved.
 *
 * Rejects if scraping or writing the file fails.
 */
async function main(): Promise<void> {
  const posts = await fetchBlogPosts();
  await writeFile('posts.json', JSON.stringify(posts, null, 2));
  console.log(`Saved ${posts.length} posts to posts.json`);
}
|
||||||
|
|
||||||
|
// Run the script. On failure, log the error and mark the process as failed so
// that shells and CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
Loading…
Reference in New Issue