2003-03-28 21:08:50 +00:00
|
|
|
-----------------------------------------------------------------------------
|
2005-08-19 01:35:26 +00:00
|
|
|
-- Little program that checks links in HTML files, using coroutines and
|
|
|
|
-- non-blocking I/O via the dispatcher module.
|
2003-06-26 18:47:49 +00:00
|
|
|
-- LuaSocket sample files
|
2003-06-09 18:23:40 +00:00
|
|
|
-- Author: Diego Nehab
|
2003-03-28 21:08:50 +00:00
|
|
|
-----------------------------------------------------------------------------
|
2005-09-29 22:26:35 +00:00
|
|
|
local url = require("socket.url")
|
2005-08-23 05:53:14 +00:00
|
|
|
local dispatch = require("dispatch")
|
|
|
|
local http = require("socket.http")
|
|
|
|
dispatch.TIMEOUT = 10
|
2005-08-19 01:35:26 +00:00
|
|
|
|
2005-08-23 05:53:14 +00:00
|
|
|
-- make sure the user knows how to invoke us
-- 'arg' may be missing when the script is not started by the standalone
-- interpreter, so fall back to an empty list
arg = arg or {}
if #arg < 1 then
    print("Usage:\n luasocket check-links.lua [-n] {<url>}")
    -- there is no global exit() in Lua; os.exit terminates the host program.
    -- A non-zero status signals that the invocation was wrong.
    os.exit(1)
end
|
|
|
|
|
2005-08-23 05:53:14 +00:00
|
|
|
-- '-n' as first argument selects non-blocking mode: the "coroutine"
-- handler is the real dispatcher interface, while the "sequential"
-- handler is the fake one used with blocking I/O
local nonblocking = (arg[1] == "-n")
if nonblocking then
    -- drop the flag so that only addresses remain in 'arg'
    table.remove(arg, 1)
end
handler = dispatch.newhandler(nonblocking and "coroutine" or "sequential")
|
|
|
|
|
|
|
|
-- number of status checks that were started and have not finished yet
local nthreads = 0

-- get the status of a URL using the dispatcher
-- Only links whose scheme is "http" are checked; any other link is
-- skipped without producing output. One line is written to standard
-- output per checked link: the link alone when the HEAD request
-- answered 200, or the link followed by the code/error otherwise.
function getstatus(link)
    local parsed = url.parse(link, {scheme = "file"})
    if parsed.scheme ~= "http" then return end
    nthreads = nthreads + 1
    handler:start(function()
        local ok, code = http.request{
            method = "HEAD",
            url = link,
            create = handler.tcp
        }
        local tail
        if ok and code == 200 then
            tail = '\n'
        else
            tail = ': ' .. tostring(code) .. '\n'
        end
        io.write('\t', link, tail)
        -- this check is done; the main loop stops when the count hits zero
        nthreads = nthreads - 1
    end)
end
|
2001-09-27 20:02:58 +00:00
|
|
|
|
|
|
|
-- read the whole contents of a local file
-- 'path' is a URL-escaped path; it is unescaped before being opened.
-- Returns the file contents as a string, or nil followed by the error
-- message reported by io.open.
function readfile(path)
    local handle, err = io.open(url.unescape(path), "r")
    if not handle then
        return nil, err
    end
    local contents = handle:read("*a")
    handle:close()
    return contents
end
|
|
|
|
|
2005-08-19 01:35:26 +00:00
|
|
|
-- fetch the document at address 'u'
-- Handles the "http" scheme (through http.request) and the "file" scheme
-- (through readfile); an address without a scheme is treated as "file".
-- Returns three values: the base URL to resolve links against, the
-- document body (nil on failure) and an error value (nil on success).
-- NOTE(review): this definition is a global named 'load', so it replaces
-- the standard Lua function of the same name for the rest of the program.
function load(u)
    local parsed = url.parse(u, { scheme = "file" })
    local scheme = parsed.scheme
    local base = u
    if scheme == "http" then
        local body, code, headers = http.request(u)
        if code == 200 then
            -- if there was a redirect, update base to reflect it
            base = headers.location or base
        end
        local err
        -- on failure http.request yields no body and the second value
        -- is reported as the error
        if not body then err = code end
        return base, body, err
    elseif scheme == "file" then
        local body, err = readfile(parsed.path)
        return base, body, err
    end
    return base, nil, string.format("unhandled scheme '%s'", scheme)
end
|
|
|
|
|
|
|
|
-- extract the targets of all href attributes found in an HTML document
-- 'body' is the HTML text and 'base' the URL used to make each target
-- absolute. Returns a list of absolute URLs: double-quoted targets first,
-- then single-quoted ones, then unquoted ones.
function getlinks(body, base)
    -- get rid of comments
    body = string.gsub(body, "%<%!%-%-.-%-%-%>", "")
    local links = {}
    -- records one target and removes the matched attribute from the text.
    -- The empty string must be returned explicitly: when the replacement
    -- function returns nil, gsub keeps the original match, and a quoted
    -- attribute would then be picked up again (quotes included) by the
    -- unquoted pattern below.
    local function collect(href)
        links[#links + 1] = url.absolute(base, href)
        return ""
    end
    -- extract links
    body = string.gsub(body, '[Hh][Rr][Ee][Ff]%s*=%s*"([^"]*)"', collect)
    body = string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*'([^']*)'", collect)
    -- an unquoted value ends at whitespace or at the end of the tag, so
    -- attributes that follow it are not part of the link
    for href in string.gmatch(body, "[Hh][Rr][Ee][Ff]%s*=%s*([^%s>\"']+)") do
        collect(href)
    end
    return links
end
|
|
|
|
|
2005-08-19 01:35:26 +00:00
|
|
|
-- check every link found in the document at 'address'
-- Loads the document, prints the base URL being checked and hands each
-- extracted link to getstatus. When the document cannot be loaded, the
-- error is printed and nothing else is done.
function checklinks(address)
    local base, body, err = load(address)
    if body then
        print("Checking ", base)
        for _, link in ipairs(getlinks(body, base)) do
            getstatus(link)
        end
    else
        print(err)
    end
end
|
|
|
|
|
2005-08-19 01:35:26 +00:00
|
|
|
-- check each address given on the command line; an address without a
-- scheme is resolved against "file:"
for index = 1, #arg do
    local address = url.absolute("file:", arg[index])
    checklinks(address)
end

-- keep stepping the dispatcher until every started check has finished
while nthreads > 0 do
    handler:step()
end
|