March 29, 2016

Write and deploy your own self-hosted Readability clone

Using Heroku and Node.js -- no experience required

Readability.org is a handy tool, but it also suffers from being useful, popular and free – the service is (understandably) often down, or slow.

If you’re a heavy Readability user, it might interest you to know that there exist many software libraries that attempt to recreate Readability’s main function – pulling the main body of text out of a page laden with ads and other distractions. Some of these work better than others, but Mozilla’s Readability.js seems to do the best.

However, Mozilla’s library is written to run within the browser. To use it with node, there exists readability-node, which put me in the uncomfortable position of having to fumble my way through this with node. But never mind, I’m brave enough and so are you. Let’s go ahead and use readability-node to build our tool and take some of the load off of readability.org.

Writing the app

The core functionality is simple enough. Readability-node expects a DOM, which can be provided via jsdom, so we’ll use that library too. Just cd to your fresh project directory and run these commands:

# Write a minimal package.json first so `npm install --save` has a manifest to record the dependencies in.
$ echo '{"name": "Readability-clone"}' > package.json
$ npm install readability-node jsdom --save

After that, start editing your app.js and we’ll try to get this thing running. The first thing we have to do is figure out how to get the source code of an arbitrary url. Here’s what I came up with:

https = require('https');
r = require('readability-node');
jsdom = require('jsdom').jsdom;

// Fetch the page and print its raw HTML once the whole response has arrived.
https.get("https://adambard.com/blog/the-web-is-a-mature-platform/", function(res){
    var src = '';
    // Decode inside the stream: appending raw Buffer chunks to a string can
    // split a multi-byte UTF-8 character across two chunks and garble it.
    res.setEncoding('utf8');
    res.on('data', function(d){ src += d; });
    res.on('end', function(){
        console.log(src);
    });
});

Note that we have to follow up all our vaguely IO-related operations with callbacks, rather than using synchronous functions and waiting like a civilized language. This is because node.js prioritizes non-blockingness above all else, especially sanity.

You can run this code with $ node app.js to see that it works as expected.

Now that we have the source, we need to feed it through readability-node via jsdom.

https = require('https');
r = require('readability-node');
jsdom = require('jsdom').jsdom;

// Fetch one page, run it through Readability and print the extracted
// title and article markup.
var uri = "https://adambard.com/blog/the-web-is-a-mature-platform/";
https.get(uri, function(res){

    var src = '';
    // Decode inside the stream: appending raw Buffer chunks to a string can
    // split a multi-byte UTF-8 character across two chunks and garble it.
    res.setEncoding('utf8');
    res.on('data', function(d){ src += d; });
    res.on('end', function(){
        // Build the DOM with external resource fetching/processing switched off.
        var doc = jsdom(src, {features: {
            FetchExternalResources: false,
            ProcessExternalResources: false
        }});
        var article = new r.Readability(uri, doc).parse();
        // Guard against parse() yielding nothing (presumably a page with no
        // recognisable article body); reading .title would otherwise throw.
        if(!article){
            console.error("Could not extract an article from " + uri);
            return;
        }
        console.log(article.title, "\n\n", article.content);
    });
});

And that’s the meat and potatoes of the app done. The rest is just building up a UI to use a dynamic uri and display the result all pretty-like. Let’s go ahead and make a server that will show the result, and while we’re at it start fetching the URL from the query string when possible.

url = require('url');
http = require('http');
https = require('https');
r = require('readability-node');
jsdom = require('jsdom').jsdom;

// Handle one HTTP request: fetch the page named by the `url` query parameter
// (or a default article), extract it with Readability and send back a bare
// HTML page containing the article's title and content.
function handleRequest(req, resp){
    var uri = url.parse(req.url, true).query.url;
    uri = uri || "https://adambard.com/blog/the-web-is-a-mature-platform/";

    https.get(uri, function(res){

        var src = '';
        // Decode inside the stream: appending raw Buffer chunks to a string
        // can split a multi-byte UTF-8 character across two chunks.
        res.setEncoding('utf8');
        res.on('data', function(d){ src += d; });
        res.on('end', function(){
            // Build the DOM with external resource fetching/processing switched off.
            var doc = jsdom(src, {features: {
                FetchExternalResources: false,
                ProcessExternalResources: false
            }});
            var article = new r.Readability(uri, doc).parse();
            // end() sends the page AND finishes the response; with write()
            // alone the connection stays open and the browser keeps loading.
            resp.end(
                "<html><head><meta charset='utf-8'><title>"
                + article.title
                + "</title></head><body>"
                + article.content
                + "</body></html>");
        });
    });
}

// Create the HTTP server and listen on the port from the PORT environment
// variable, falling back to 3000; log "OK" once it is accepting connections.
var server = http.createServer(handleRequest);
var port = process.env.PORT || 3000;

server.listen(port, function(){
    console.log("OK");
});

You can run the app and point your browser at http://localhost:3000/ to see your server running in all its glory. At some point we’re going to want some templating. I like mustache, so let’s use that and set up a templates dir:

$ npm install mu2 --save
$ mkdir templates

Now we can edit templates/index.html as necessary. Here’s what I came up with:

<!DOCTYPE html>
<!-- The app's only template; rendered with the "title" and "content" of the parsed article. -->
<html>
    <head>
        <meta charset="UTF-8">
        <title>{{title}}</title>
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Libre+Baskerville">
        <link rel="stylesheet" href="/css/style.css">
    </head>
    <body>
        <header>
            <h1><a href="/">Fungibility</a></h1>
            <form method="get" action="/">
                <label for="url">Enter URL:</label>
                <!-- The id must match the label's "for" so the label is actually tied to this field. -->
                <input type="text" name="url" id="url">
                <button>Load</button>
            </form>
        </header>
        <main>
            <div class="container">
                <h2>{{title}}</h2>
                <!-- Unescaped on purpose: content is the article's HTML markup. -->
                {{{content}}}
            </div>
        </main>
    </body>
</html>

Now, we have a problem: we need a way to serve /css/style.css. But, since we’re really half-assing this whole thing, we’ll just serve it up from Node:

url = require('url');
path = require('path');
fs = require('fs');
http = require('http');
https = require('https');
r = require('readability-node');
jsdom = require('jsdom').jsdom;
mu = require('mu2');
mu.root = __dirname + '/templates';

// Render templates/index.html (the app's single template) with `ctx` and
// stream the resulting markup into the response `resp`.
function render(resp, ctx){
    var page = mu.compileAndRender('index.html', ctx);
    page.pipe(resp);
}


// Send the file at `filename` to the client through `resp`.
// On success the contents go out with a bare 200; if the file cannot be
// read, the error text is returned as a plain-text 500 instead.
function serveFile(filename, resp){
    function onRead(err, file){
        if(!err){
            resp.writeHead(200);
            resp.write(file, "binary");
            resp.end();
        }else{
            resp.writeHead(500, {"Content-Type": "text/plain"});
            resp.write(err + "\n");
            resp.end();
        }
    }

    fs.readFile(filename, "binary", onRead);
}

// Fetch `uri`, extract its main article with Readability and render it into
// `resp`. A missing uri falls back to a default article. Every failure (bad
// or unsupported URL, network error, nothing extractable) is reported to the
// client as a plain-text error instead of throwing and taking the server down.
//
// NOTE(review): this fetches whatever URL the client supplies, including
// addresses on the server's internal network; restrict it if that matters.
function serveReadability(uri, resp){
    uri = uri || "https://adambard.com/blog/the-web-is-a-mature-platform/";

    // Report a failure to the client, at most once per request.
    var failed = false;
    function fail(status, message){
        if(failed || resp.headersSent){ return; }
        failed = true;
        resp.writeHead(status, {"Content-Type": "text/plain"});
        resp.end(message + "\n");
    }

    // A repeated ?url= parameter is parsed into an array, not a string.
    if(typeof uri !== "string"){
        fail(400, "Expected exactly one url parameter");
        return;
    }

    // https.get() throws on any protocol other than https:, so choose the
    // client module from the URL and reject everything that is not HTTP(S).
    var protocol;
    try{
        protocol = url.parse(uri).protocol;
    }catch(e){
        protocol = null;
    }
    var client = null;
    if(protocol === "https:"){
        client = https;
    }else if(protocol === "http:"){
        client = http;
    }
    if(!client){
        fail(400, "Unsupported URL: " + uri);
        return;
    }

    var request;
    try{
        request = client.get(uri, function(res){

            var src = '';
            // Decode inside the stream: appending raw Buffer chunks to a
            // string can split a multi-byte UTF-8 character across chunks.
            res.setEncoding('utf8');
            res.on('data', function(d){ src += d; });
            res.on('end', function(){
                var article;
                try{
                    // Build the DOM with external resource fetching/processing switched off.
                    var doc = jsdom(src, {features: {
                        FetchExternalResources: false,
                        ProcessExternalResources: false
                    }});
                    article = new r.Readability(uri, doc).parse();
                }catch(e){
                    fail(502, "Could not parse " + uri);
                    return;
                }
                // Guard against parse() yielding nothing (presumably a page
                // with no recognisable article body).
                if(!article){
                    fail(502, "Could not extract an article from " + uri);
                    return;
                }
                render(resp, article);
            });
        });
    }catch(e){
        // get() throws synchronously on URLs it cannot turn into a request.
        fail(400, "Invalid URL: " + uri);
        return;
    }

    // Without a listener, a DNS or connection failure is an unhandled
    // 'error' event, which terminates the whole process.
    request.on('error', function(err){
        fail(502, "Could not fetch " + uri + ": " + err.message);
    });
}

// Route one HTTP request: if the request path names a regular file inside
// the directory the server was started from, serve that file; otherwise
// treat the request as a Readability lookup for the `url` query parameter.
function handleRequest(req, resp){
    var req_url = url.parse(req.url, true);
    var uri = req_url.query.url;
    var root = process.cwd();
    var filename = path.join(root, req_url.pathname || "/");

    // path.join() collapses ".." segments, so a path like "/../../etc/passwd"
    // would resolve outside the project directory. Only paths that stay
    // strictly below root are eligible for static serving.
    // NOTE(review): everything below root (app.js, package.json, ...) is still
    // served; move static assets to their own directory to tighten this.
    var relative = path.relative(root, filename);
    var insideRoot = relative !== "" &&
        relative.split(path.sep)[0] !== ".." &&
        !path.isAbsolute(relative);
    if(!insideRoot){
        serveReadability(uri, resp);
        return;
    }

    // A single async stat replaces the deprecated fs.exists() + statSync()
    // pair, which could throw if the file vanished between the two calls.
    fs.stat(filename, function(err, stats){
        if(!err && stats.isFile()){
            serveFile(filename, resp);
        }else{
            serveReadability(uri, resp);
        }
    });
}

// Create the HTTP server and listen on the port from the PORT environment
// variable, falling back to 3000; log "OK" once it is accepting connections.
var server = http.createServer(handleRequest);
var port = process.env.PORT || 3000;

server.listen(port, function(){
    console.log("OK");
});

You can (re)start the server and see that you now have a comparatively nice layout for your app. That’s pretty much the baseline done; I’ll leave the tweaks and css as an exercise for the reader (just create a file /css/style.css in your project directory and edit it, it should just work).

Deploying to Heroku

This part is easy, too. Make sure you have the latest heroku toolbelt, then:

$ git init
$ echo 'web: node app.js' > Procfile
$ echo 'node_modules' > .gitignore
$ git add . && git commit -m "Initial commit"
$ heroku create
$ git push heroku master

Navigate to the Heroku url provided, and you should see something pop up. It’s as straightforward as that!

What if I just want to put my own clone up?

I called my project fungibility and put it up on github for anyone that just wants to use it (it also has somewhat better error handling):

$ git clone https://github.com/adambard/fungibility.git
$ cd fungibility
$ heroku create
$ git push heroku master

That should work just fine. Or, I guess, you could just use fungibility directly.