Write and deploy your own self-hosted Readability clone
Using Heroku and Node.js -- no experience required
Readability.org is a handy tool, but it also suffers from being useful, popular and free – the service is (understandably) often down, or slow.
If you’re a heavy Readability user, it might interest you to know that there exist many software libraries that attempt to recreate Readability’s main function – pulling the main body of text out of a page laden with ads and other distractions. Some of these work better than others, but Mozilla’s Readability.js seems to do the best.
However, Mozilla’s library is written to run within the browser. To use it with node,
there exists readability-node
, which put me in the uncomfortable position of having to fumble my
way through this with node. But never mind, I’m brave enough and so are you. Let’s go ahead and use
readability-node to build our tool and take some of the load off of readability.org.
Writing the app
The core functionality is simple enough. Readability-node expects a DOM, which can be provided
via jsdom
, so we’ll need to use that library too. Just cd to your fresh project directory and run
these commands:
$ echo '{"name": "Readability-clone"}' > package.json
$ npm install readability-node jsdom --save
After that, start editing your app.js
and we’ll try to get this thing running.
The first thing we have to do is figure out how to get the source code of an arbitrary
url. Here’s what I came up with:
// Step 1: fetch a page over HTTPS and print its raw HTML source.
// Bindings are declared with `var` so they do not leak as implicit globals.
var https = require('https');
var r = require('readability-node'); // not used in this step
var jsdom = require('jsdom').jsdom;  // not used in this step

var request = https.get("https://adambard.com/blog/the-web-is-a-mature-platform/", function(res){
  var src = '';
  // Decode as UTF-8 at the stream level so a multi-byte character that is
  // split across two chunks is reassembled instead of being corrupted.
  res.setEncoding('utf8');
  res.on('data', function(d){ src += d; });
  res.on('end', function(){
    console.log(src);
  });
});

// Without an 'error' listener, a DNS or connection failure is thrown as an
// uncaught exception and kills the process.
request.on('error', function(err){
  console.error("Request failed: " + err.message);
});
Note that we have to follow up all our vaguely IO-related operations with callbacks, rather than using synchronous functions and waiting like a civilized language. This is because node.js prioritizes non-blockingness above all else, especially sanity.
You can run this code with $ node app.js
to see that it works as expected.
Now that we have the source, we need to feed it through readability-node via jsdom.
// Step 2: fetch a page, build a DOM from it with jsdom, and print the
// title and main content extracted by readability-node.
// Bindings are declared with `var` so they do not leak as implicit globals.
var https = require('https');
var r = require('readability-node');
var jsdom = require('jsdom').jsdom;

var uri = "https://adambard.com/blog/the-web-is-a-mature-platform/";

var request = https.get(uri, function(res){
  var src = '';
  // Decode as UTF-8 at the stream level so a multi-byte character that is
  // split across two chunks is reassembled instead of being corrupted.
  res.setEncoding('utf8');
  res.on('data', function(d){ src += d; });
  res.on('end', function(){
    // Do not let jsdom download or execute the page's scripts/stylesheets;
    // only the parsed markup is needed.
    var doc = jsdom(src, {features: {
      FetchExternalResources: false,
      ProcessExternalResources: false
    }});
    var article = new r.Readability(uri, doc).parse();
    // parse() may yield nothing when no article can be extracted; guard
    // before reading its properties.
    if (!article) {
      console.error("Could not extract an article from " + uri);
      return;
    }
    console.log(article.title, "\n\n", article.content);
  });
});

// Without an 'error' listener, a DNS or connection failure is thrown as an
// uncaught exception and kills the process.
request.on('error', function(err){
  console.error("Request failed: " + err.message);
});
And that’s the meat and potatoes of the app done. The rest is just building up a UI to use a dynamic uri and display the result all pretty-like. Let’s go ahead and make a server that will show the result, and while we’re at it start fetching the URL from the query string when possible.
// Module dependencies. Each binding is declared with `var`; a bare
// assignment would create an implicit global (and throws in strict mode).
var url = require('url');
var http = require('http');
var https = require('https');
var r = require('readability-node');
var jsdom = require('jsdom').jsdom;
/**
 * HTTP request handler: fetch the page named by the `url` query parameter
 * (or a default article), extract its main content with readability-node,
 * and send back a minimal HTML page containing the title and content.
 *
 * Failures (bad URL, network error, unparseable page) are reported to the
 * client as plain-text error responses instead of crashing the server.
 *
 * @param {http.IncomingMessage} req  incoming request; only req.url is read
 * @param {http.ServerResponse} resp  response to write the page to
 */
function handleRequest(req, resp){
  var requested = url.parse(req.url, true).query.url;
  var uri = requested || "https://adambard.com/blog/the-web-is-a-mature-platform/";

  // Escape text before placing it inside the <title> element; the title
  // comes from an arbitrary remote page.
  function escapeHtml(text){
    return String(text)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");
  }

  // Send a plain-text error, unless a response has already been started.
  function fail(status, message){
    if (resp.headersSent) {
      resp.end();
      return;
    }
    resp.writeHead(status, {"Content-Type": "text/plain"});
    resp.end(message + "\n");
  }

  // https.get rejects "http:" URLs, so pick the client by scheme.
  var client = /^http:/i.test(uri) ? http : https;
  var request;

  try {
    request = client.get(uri, function(res){
      var src = '';
      // Decode at the stream level so multi-byte characters split across
      // chunks are not corrupted.
      res.setEncoding('utf8');
      res.on('data', function(d){ src += d; });
      res.on('end', function(){
        var article;
        try {
          var doc = jsdom(src, {features: {
            FetchExternalResources: false,
            ProcessExternalResources: false
          }});
          article = new r.Readability(uri, doc).parse();
        } catch (err) {
          fail(502, "Could not parse " + uri);
          return;
        }
        if (!article) {
          fail(502, "Could not extract an article from " + uri);
          return;
        }

        var title = article.title == null ? "" : article.title;
        resp.writeHead(200, {"Content-Type": "text/html; charset=utf-8"});
        // end() (not just write()) is required, otherwise the response
        // never completes and the browser keeps waiting.
        resp.end(
          "<html><head><meta charset='utf-8'><title>"
          + escapeHtml(title)
          + "</title></head><body>"
          + article.content
          + "</body></html>");
      });
    });
  } catch (err) {
    // get() throws synchronously on malformed or unsupported URLs.
    fail(400, "Invalid URL: " + uri);
    return;
  }

  // Network-level failures (DNS, refused connection, ...) arrive here;
  // unhandled, they would take the whole server down.
  request.on('error', function(err){
    fail(502, "Could not fetch " + uri + ": " + err.message);
  });
}
// Start the HTTP server on the port given by the environment (PORT),
// falling back to 3000 for local runs; log "OK" once it is listening.
var port = process.env.PORT || 3000;
var server = http.createServer(handleRequest);
server.listen(port, function onListening(){
  console.log("OK");
});
You can run the app and point your browser at http://localhost:3000/ to
see your server running in all its glory.
At some point we’re going to want some templating. I like mustache, so let’s
use that and set up a templates
dir:
$ npm install mu2 --save
$ mkdir templates
Now we can edit templates/index.html
as necessary. Here’s what I came up with:
<html>
  <head>
    <meta charset="UTF-8">
    <!-- Mustache template: the title tag is HTML-escaped on output; the
         triple-brace content tag in the body is inserted unescaped because
         the extracted article is already HTML. -->
    <title>{{title}}</title>
    <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Libre+Baskerville">
    <link rel="stylesheet" href="/css/style.css">
  </head>
  <body>
    <header>
      <h1><a href="/">Fungibility</a></h1>
      <form method="get" action="/">
        <!-- The id on the input is what the label's "for" attribute targets. -->
        <label for="url">Enter URL:</label>
        <input type="text" name="url" id="url">
        <button>Load</button>
      </form>
    </header>
    <main>
      <div class="container">
        <h2>{{title}}</h2>
        {{{content}}}
      </div>
    </main>
  </body>
</html>
Now, we have a problem: we need a way to serve /css/style.css
. But, since we’re really half-assing
this whole thing, we’ll just serve it up from Node:
// Module dependencies. Each binding is declared with `var`; a bare
// assignment would create an implicit global (and throws in strict mode).
var url = require('url');
var path = require('path');
var fs = require('fs');
var http = require('http');
var https = require('https');
var r = require('readability-node');
var jsdom = require('jsdom').jsdom;
var mu = require('mu2');

// Templates are looked up relative to this directory.
mu.root = __dirname + '/templates';
/**
 * Render index.html (the app's only template) with the given context and
 * stream the result into the response.
 *
 * @param resp  writable response the rendered page is piped into
 * @param ctx   template context passed to mu.compileAndRender
 */
function render(resp, ctx){
  var page = mu.compileAndRender('index.html', ctx);
  page.pipe(resp);
}
/**
 * Send the contents of a file on disk as the HTTP response.
 *
 * On success responds 200 with the raw file bytes and, for well-known
 * extensions, a matching Content-Type. On a read error responds 500 with
 * the error text as text/plain.
 *
 * @param {string} filename  path of the file to send
 * @param resp               response object (writeHead / end are used)
 */
function serveFile(filename, resp){
  // Small extension -> MIME type table; unknown extensions are sent
  // without a Content-Type, as before.
  var contentTypes = {
    ".css": "text/css; charset=utf-8",
    ".js": "application/javascript; charset=utf-8",
    ".html": "text/html; charset=utf-8",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".svg": "image/svg+xml",
    ".ico": "image/x-icon"
  };

  // No encoding argument: the file is read as a Buffer and sent untouched,
  // rather than round-tripped through a legacy "binary" string.
  fs.readFile(filename, function(err, file) {
    if(err) {
      resp.writeHead(500, {"Content-Type": "text/plain"});
      resp.end(err + "\n");
      return;
    }

    var headers = {};
    var type = contentTypes[path.extname(filename).toLowerCase()];
    if(type) {
      headers["Content-Type"] = type;
    }
    resp.writeHead(200, headers);
    resp.end(file);
  });
}
/**
 * Fetch a remote page, extract its main article with readability-node and
 * render it through the index.html template.
 *
 * Failures (bad URL, network error, unparseable page) are reported to the
 * client as plain-text error responses instead of crashing the server.
 *
 * @param {string} uri  page to fetch; a default article is used when falsy
 * @param resp          HTTP response the rendered page (or error) goes to
 */
function serveReadability(uri, resp){
  var target = uri || "https://adambard.com/blog/the-web-is-a-mature-platform/";

  // Send a plain-text error, unless a response has already been started.
  function fail(status, message){
    if (resp.headersSent) {
      resp.end();
      return;
    }
    resp.writeHead(status, {"Content-Type": "text/plain"});
    resp.end(message + "\n");
  }

  // https.get rejects "http:" URLs, so pick the client by scheme.
  var client = /^http:/i.test(target) ? http : https;
  var request;

  try {
    request = client.get(target, function(res){
      var src = '';
      // Decode at the stream level so multi-byte characters split across
      // chunks are not corrupted.
      res.setEncoding('utf8');
      res.on('data', function(d){ src += d; });
      res.on('end', function(){
        var article;
        try {
          // Do not let jsdom download or run the page's external resources.
          var doc = jsdom(src, {features: {
            FetchExternalResources: false,
            ProcessExternalResources: false
          }});
          article = new r.Readability(target, doc).parse();
        } catch (err) {
          fail(502, "Could not parse " + target);
          return;
        }
        // parse() may yield nothing when no article can be extracted.
        if (!article) {
          fail(502, "Could not extract an article from " + target);
          return;
        }
        render(resp, article);
      });
    });
  } catch (err) {
    // get() throws synchronously on malformed or unsupported URLs.
    fail(400, "Invalid URL: " + target);
    return;
  }

  // Network-level failures (DNS, refused connection, ...) arrive here;
  // unhandled, they would take the whole server down.
  request.on('error', function(err){
    fail(502, "Could not fetch " + target + ": " + err.message);
  });
}
/**
 * Top-level request handler. If the request path names a regular file
 * inside the working directory, that file is served; every other request
 * is treated as a readability request for the `url` query parameter.
 *
 * @param req   incoming HTTP request; only req.url is read
 * @param resp  HTTP response handed on to serveFile / serveReadability
 */
function handleRequest(req, resp){
  var req_url = url.parse(req.url, true);
  var root = process.cwd();
  var filename = path.join(root, req_url.pathname || "/");
  var uri = req_url.query.url;

  // path.join collapses ".." segments, so a path such as
  // "/../../etc/passwd" would otherwise escape the project directory.
  // Only paths that stay inside `root` are eligible for file serving.
  var relative = path.relative(root, filename);
  var escapesRoot = relative === ".."
    || relative.indexOf(".." + path.sep) === 0
    || path.isAbsolute(relative);

  if (escapesRoot || filename.indexOf("\0") !== -1) {
    serveReadability(uri, resp);
    return;
  }

  // A single non-blocking stat replaces the deprecated fs.exists plus a
  // synchronous statSync; any stat error simply means "not a file".
  fs.stat(filename, function(err, stats) {
    if(!err && stats.isFile()){
      serveFile(filename, resp);
    }else{
      serveReadability(uri, resp);
    }
  });
}
// Start the HTTP server on the port given by the environment (PORT),
// falling back to 3000 for local runs; log "OK" once it is listening.
var port = process.env.PORT || 3000;
var server = http.createServer(handleRequest);
server.listen(port, function onListening(){
  console.log("OK");
});
You can (re)start the server and see that you now have a comparatively nice layout for your
app. That’s pretty much the baseline done; I’ll leave the tweaks and css as an exercise for
the reader (just create a file /css/style.css
in your project directory and edit it, it should
just work).
Deploying to Heroku
This part is easy, too. Make sure you have the latest heroku toolbelt, then:
$ git init
$ echo 'web: node app.js' > Procfile
$ echo 'node_modules' > .gitignore
$ git add . && git commit -m "Initial commit"
$ heroku create
$ git push heroku master
Navigate to the Heroku url provided, and you should see something pop up. It’s as straightforward as that!
What if I just want to put my own clone up?
I called my project fungibility and put it up on github for anyone that just wants to use it (it also has somewhat better error handling):
$ git clone https://github.com/adambard/fungibility.git
$ cd fungibility
$ heroku create
$ git push heroku master
That should work just fine. Or, I guess, you could just use fungibility directly.