Friday, February 17, 2012

Get Blogger Posts with NodeJS

Today I wrote a blogger scraper for my new NodeJS website:  http://mikekunze.info

This script connects to my blog's RSS JSON feed and sends its data to a MongoLab-hosted MongoDB instance.

Pretty nifty.  Now I have content for my website, and I can still use Blogger's API to create that content.

One of the bonuses of this script is that it will not duplicate entries.  This de-duplication checks entry titles.  If we wanted to be more robust, we would hash the entry content instead... maybe someday.

# Scrape this blog's Blogger JSON feed and persist each entry to MongoDB.
# Entries are de-duplicated by title: an entry whose title already exists
# in the collection is logged as "cached" and skipped.
require 'iced-coffee-script'
mongoose = require 'mongoose'
httpAgent = require 'http-agent'
async = require 'async'

# NOTE(review): credentials are hard-coded in the connection string; move
# them to environment variables before sharing or deploying this script.
mongoConnect = 'mongodb://username:password@server.mongolab.com:27367/db'
mongoose.connect mongoConnect

Schema = mongoose.Schema
ObjectId = Schema.ObjectId

# Minimal post schema: body HTML, title (used as the de-dup key), publish date.
mongoose.model 'blogpost', new Schema {
  content: String
  title: String
  timestamp: Date
}

# Visit the feed URL; http-agent emits 'next' once per URL with its body.
agent = httpAgent.create 'blogName.blogspot.com', ['feeds/posts/default?alt=json']

agent.addListener 'next', (e, agent) =>
  response = JSON.parse agent.body

  # Persist a single feed entry unless a post with the same title exists.
  # Errors from the lookup or the save are passed to cb so async.forEach
  # can surface them instead of silently dropping them.
  eachEntry = (item, cb) ->
    BlogPost = mongoose.model 'blogpost'
    BlogPost.findOne { title: item.title.$t }, (err, doc) ->
      return cb err if err
      if doc
        console.log 'cached ' + item.title.$t
        cb()
      else
        blogPost = new BlogPost()
        blogPost.content = item.content.$t
        blogPost.title = item.title.$t
        blogPost.timestamp = item.published.$t
        blogPost.save (err) ->
          console.log 'Done saving ' + item.title.$t unless err
          cb err

  # Process every entry; exit once all saves/lookups have completed.
  async.forEach response.feed.entry, eachEntry, (err) ->
    console.error err if err
    console.log '\n'
    process.exit()

  agent.next()

agent.addListener 'stop', (e, agent) ->
  console.log 'Agent has completed visiting all urls\n\n'

agent.start()

3 comments: