Module:Webarchive
From Self-sufficiency
Revision as of 18:18, 25 October 2017 by GreenC (Talk) (change ".il" to ".li" / fix uppercase in hostname / add mementoweb detection)
Documentation for this module may be created at Module:Webarchive/doc
--[[ ---------------------------------- Lua module implementing the {{webarchive}} template. A merger of the functionality of three templates: {{wayback}}, {{webcite}} and {{cite archives}} ]] local p = {} --[[--------------------------< inlineError >----------------------- Critical error. Render output completely in red. Add to tracking category. ]] local function inlineError(arg, msg) track["Category:Webarchive template errors"] = 1 return '<span style="font-size:100%" class="error citation-comment">Error in webarchive template: Check <code style="color:inherit; border:inherit; padding:inherit;">|' .. arg .. '=</code> value. ' .. msg .. '</span>' end --[[--------------------------< inlineRed >----------------------- Render a text fragment in red, such as a warning as part of the final output. Add tracking category. ]] local function inlineRed(msg, trackmsg) if trackmsg == "warning" then track["Category:Webarchive template warnings"] = 1 elseif trackmsg == "error" then track["Category:Webarchive template errors"] = 1 end return '<span style="font-size:100%" class="error citation-comment">' .. msg .. '</span>' end --[[--------------------------< trimArg >----------------------- trimArg returns nil if arg is "" while trimArg2 returns 'true' if arg is "" trimArg2 is for args that might accept an empty value, as an on/off switch like nolink= ]] local function trimArg(arg) if arg == "" or arg == nil then return nil else return mw.text.trim(arg) end end local function trimArg2(arg) if arg == nil then return nil else return mw.text.trim(arg) end end --[[--------------------------< base62 >----------------------- Convert base-62 to base-10 Credit: https://de.wikipedia.org/wiki/Modul:Expr ]] local function base62( value ) local r = 1 if value:match( "^%w+$" ) then local n = #value local k = 1 local c r = 0 for i = n, 1, -1 do c = value:byte( i, i ) if c >= 48 and c <= 57 then c = c - 48 elseif c >= 65 and c <= 90 then c = c - 55 elseif c >= 97 and c <= 122 then c = c - 61 else -- How comes? r = 1 break -- for i end r = r + c * k k = k * 62 end -- for i end return r end --[[--------------------------< tableLength >----------------------- Given a 1-D table, return number of elements ]] local function tableLength(T) local count = 0 for _ in pairs(T) do count = count + 1 end return count end --[[--------------------------< dateFormat >----------------------- Given a date string, return its format: dmy, mdy, iso, ymd If unable to determine return nil ]] local function dateFormat(date) local dt = {} dt.split = {} dt.split = mw.text.split(date, "-") if tableLength(dt.split) == 3 then if tonumber(dt.split[1]) > 1900 and tonumber(dt.split[1]) < 2200 and tonumber(dt.split[2]) and tonumber(dt.split[3]) then return "iso" else return nil end end dt.split = mw.text.split(date, " ") if tableLength(dt.split) == 3 then if tonumber(dt.split[3]) then if tonumber(dt.split[3]) > 1900 and tonumber(dt.split[3]) < 2200 then if tonumber(dt.split[1]) then return "dmy" else return "mdy" end else if tonumber(dt.split[1]) then if tonumber(dt.split[1]) > 1900 and tonumber(dt.split[1]) < 2200 then return "ymd" end end end end end return nil end --[[--------------------------< makeDate >----------------------- Given a zero-padded 4-digit year, 2-digit month and 2-digit day, return a full date in df format df = mdy, dmy, iso, ymd ]] local function makeDate(year, month, day, df) if not year or year == "" or not month or month == "" or not day or day == "" then return nil end local zmonth = month -- month with leading 0 month = month:match("0*(%d+)") -- month without leading 0 if tonumber(month) < 1 or tonumber(month) > 12 then return year end local nmonth = os.date("%B", os.time{year=2000, month=month, day=1} ) -- month in name form if not nmonth then return year end local zday = day day = zday:match("0*(%d+)") if tonumber(day) < 1 or tonumber(day) > 31 then if df == "mdy" or df == "dmy" then return nmonth .. " " .. year elseif df == "iso" then return year .. "-" .. zmonth elseif df == "ymd" then return year .. " " .. nmonth else return nmonth .. " " .. year end end if df == "mdy" then return nmonth .. " " .. day .. ", " .. year -- September 1, 2016 elseif df == "dmy" then return day .. " " .. nmonth .. " " .. year -- 1 September 2016 elseif df == "iso" then return year .. "-" .. zmonth .. "-" .. zday -- 2016-09-01 elseif df == "ymd" then return year .. " " .. nmonth .. " " .. cday -- 2016 September 1 else return nmonth .. " " .. day .. ", " .. year -- September 1, 2016 end end --[[--------------------------< decodeWebciteDate >----------------------- Given a URI-path to Webcite (eg. /67xHmVFWP) return the encoded date in df format ]] local function decodeWebciteDate(path, df) local dt = {} dt.split = {} dt.split = mw.text.split(path, "/") -- valid URL formats that are not base62 -- http://www.webcitation.org/query?id=1138911916587475 -- http://www.webcitation.org/query?url=http..&date=2012-06-01+21:40:03 -- http://www.webcitation.org/1138911916587475 -- http://www.webcitation.org/cache/73e53dd1f16cf8c5da298418d2a6e452870cf50e -- http://www.webcitation.org/getfile.php?fileid=1c46e791d68e89e12d0c2532cc3cf629b8bc8c8e if mw.ustring.find( dt.split[2], "query", 1, plain) or mw.ustring.find( dt.split[2], "cache", 1, plain) or mw.ustring.find( dt.split[2], "getfile", 1, plain) or tonumber(dt.split[2]) then return "query" end dt.full = os.date("%Y %m %d", string.sub(string.format("%d", base62(dt.split[2])),1,10) ) dt.split = mw.text.split(dt.full, " ") dt.year = dt.split[1] dt.month = dt.split[2] dt.day = dt.split[3] if not tonumber(dt.year) or not tonumber(dt.month) or not tonumber(dt.day) then return inlineRed("[Date error] (1)", "error") end if tonumber(dt.month) > 12 or tonumber(dt.day) > 31 or tonumber(dt.month) < 1 then return inlineRed("[Date error] (2)", "error") end if tonumber(dt.year) > tonumber(os.date("%Y")) or tonumber(dt.year) < 1900 then return inlineRed("[Date error] (3)", "error") end fulldate = makeDate(dt.year, dt.month, dt.day, df) if not fulldate then return inlineRed("[Date error] (4)", "error") else return fulldate end end --[[--------------------------< decodeWaybackDate >----------------------- Given a URI-path to Wayback (eg. /web/20160901010101/http://example.com ) return the formatted date eg. "September 1, 2016" in df format Handle non-digits in snapshot ID such as "re_" and "-" and "*" ]] local function decodeWaybackDate(path, df) local snapdate, snapdatelong, currdate, fulldate local safe = path snapdate = string.gsub(safe, "^/w?e?b?/?", "") -- Remove leading "/web/" or "/" safe = snapdate local N = mw.text.split(safe, "/") snapdate = N[1] if snapdate == "*" then -- eg. /web/*/http.. return "index" end safe = snapdate snapdate = string.gsub(safe, "[a-z][a-z]_[0-9]?$", "") -- Remove any trailing "re_" from date safe = snapdate snapdate = string.gsub(safe, "[-]", "") -- Remove dashes from date eg. 2015-01-01 safe = snapdate snapdate = string.gsub(safe, "[*]$", "") -- Remove trailing "*" if not tonumber(snapdate) then return inlineRed("[Date error] (2)", "error") end local dlen = string.len(snapdate) if dlen < 4 then return inlineRed("[Date error] (3)", "error") end if dlen < 14 then snapdatelong = snapdate .. string.rep("0", 14 - dlen) else snapdatelong = snapdate end local year = string.sub(snapdatelong, 1, 4) local month = string.sub(snapdatelong, 5, 6) local day = string.sub(snapdatelong, 7, 8) if not tonumber(year) or not tonumber(month) or not tonumber(day) then return inlineRed("[Date error] (4)", "error") end if tonumber(month) > 12 or tonumber(day) > 31 or tonumber(month) < 1 then return inlineRed("[Date error] (5)", "error") end currdate = os.date("%Y") if tonumber(year) > tonumber(currdate) or tonumber(year) < 1900 then return inlineRed("[Date error] (6)", "error") end fulldate = makeDate(year, month, day, df) if not fulldate then return inlineRed("[Date error] (7)", "error") else return fulldate end end --[[--------------------------< decodeArchiveisDate >----------------------- Given an Archive.is "long link" URI-path (e.g. /2016.08.28-144552/http://example.com) return the date in df format (e.g. if df = dmy, return 28 August 2016) Handles "." and "-" in snapshot date, so 2016.08.28-144552 is same as 20160828144552 ]] local function decodeArchiveisDate(path, df) local snapdate, snapdatelong, currdate, fulldate local safe = path local N = mw.text.split(safe, "/") safe = N[2] -- get snapshot date, e.g. 2016.08.28-144552 snapdate = string.gsub(safe, "[%.%-]", "") -- remove periods and hyphens if not tonumber(snapdate) then -- if not numeric, it is "short link", not date return "short link" -- e.g. http://archive.is/hD1qz end local dlen = string.len(snapdate) if dlen < 4 then return inlineRed("[Date error] (3)", "error") end if dlen < 14 then snapdatelong = snapdate .. string.rep("0", 14 - dlen) else snapdatelong = snapdate end local year = string.sub(snapdatelong, 1, 4) local month = string.sub(snapdatelong, 5, 6) local day = string.sub(snapdatelong, 7, 8) if not tonumber(year) or not tonumber(month) or not tonumber(day) then return inlineRed("[Date error] (4)", "error") end if tonumber(month) > 12 or tonumber(day) > 31 or tonumber(month) < 1 then return inlineRed("[Date error] (5)", "error") end currdate = os.date("%Y") if tonumber(year) > tonumber(currdate) or tonumber(year) < 1900 then return inlineRed("[Date error] (6)", "error") end fulldate = makeDate(year, month, day, df) if not fulldate then return inlineRed("[Date error] (7)", "error") else return fulldate end end --[[--------------------------< serviceName >----------------------- Given a domain extracted by mw.uri.new() (eg. web.archive.org) set tail string and service ID ]] local function serviceName(host, nolink) local tracking = "Category:Webarchive template other archives" local bracketopen = "[[" local bracketclose = "]]" if nolink then bracketopen = "" bracketclose = "" end ulx.url1.service = "other" ulx.url1.tail = " at " .. ulx.url1.host .. " " .. inlineRed("Error: unknown archive URL") host = string.lower(host) if mw.ustring.find( host, "europarchive.org", 1, plain ) then -- any containing "archive.org" listed before Wayback to avoid disambiguation ulx.url1.tail = " at the " .. bracketopen .. "National Library of Ireland" .. bracketclose elseif mw.ustring.find( host, "webarchive.org.uk", 1, plain ) then ulx.url1.tail = " at the " .. bracketopen .. "UK Web Archive" .. bracketclose elseif mw.ustring.find( host, "archive.org", 1, plain ) then ulx.url1.service = "wayback" ulx.url1.tail = " at the " .. bracketopen .. "Wayback Machine" .. bracketclose tracking = "Category:Webarchive template wayback links" elseif mw.ustring.find( host, "webcitation.org", 1, plain ) then ulx.url1.service = "webcite" ulx.url1.tail = " at " .. bracketopen .. "WebCite" .. bracketclose tracking = "Category:Webarchive template webcite links" elseif mw.ustring.find( host, "archive.is", 1, plain ) then ulx.url1.service = "archiveis" ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose tracking = "Category:Webarchive template archiveis links" elseif mw.ustring.find( host, "archive.fo", 1, plain ) then ulx.url1.service = "archiveis" ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose tracking = "Category:Webarchive template archiveis links" elseif mw.ustring.find( host, "archive.today", 1, plain ) then ulx.url1.service = "archiveis" ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose tracking = "Category:Webarchive template archiveis links" elseif mw.ustring.find( host, "archive.li", 1, plain ) then ulx.url1.service = "archiveis" ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose tracking = "Category:Webarchive template archiveis links" elseif mw.ustring.find( host, "archive.ec", 1, plain ) then ulx.url1.service = "archiveis" ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose tracking = "Category:Webarchive template archiveis links" elseif mw.ustring.find( host, "archive[-]it.org", 1, plain ) then ulx.url1.service = "archiveit" ulx.url1.tail = " at " .. bracketopen .. "Archive-It" .. bracketclose elseif mw.ustring.find( host, "arquivo.pt", 1, plain) then ulx.url1.tail = " at the " .. "Portuguese Web Archive" elseif mw.ustring.find( host, "loc.gov", 1, plain ) then ulx.url1.tail = " at the " .. bracketopen .. "Library of Congress" .. bracketclose elseif mw.ustring.find( host, "webharvest.gov", 1, plain ) then ulx.url1.tail = " at the " .. bracketopen .. "National Archives and Records Administration" .. bracketclose elseif mw.ustring.find( host, "bibalex.org", 1, plain ) then ulx.url1.tail = " at " .. "[[Bibliotheca_Alexandrina#Internet_Archive_partnership|Bibliotheca Alexandrina]]" elseif mw.ustring.find( host, "collectionscanada", 1, plain ) then ulx.url1.tail = " at the " .. "Canadian Government Web Archive" elseif mw.ustring.find( host, "haw.nsk", 1, plain ) then ulx.url1.tail = " at the " .. "Croatian Web Archive (HAW)" elseif mw.ustring.find( host, "veebiarhiiv.digar.ee", 1, plain ) then ulx.url1.tail = " at the " .. "Estonian Web Archive" elseif mw.ustring.find( host, "vefsafn.is", 1, plain ) then ulx.url1.tail = " at the " .. "[[National and University Library of Iceland]]" elseif mw.ustring.find( host, "proni.gov", 1, plain ) then ulx.url1.tail = " at the " .. bracketopen .. "Public Record Office of Northern Ireland" .. bracketclose elseif mw.ustring.find( host, "uni[-]lj.si", 1, plain ) then ulx.url1.tail = " at the " .. "Slovenian Web Archive" elseif mw.ustring.find( host, "stanford.edu", 1, plain ) then ulx.url1.tail = " at the " .. "[[Stanford University Libraries|Stanford Web Archive]]" elseif mw.ustring.find( host, "nationalarchives.gov.uk", 1, plain ) then ulx.url1.tail = " at the " .. bracketopen .. "UK Government Web Archive" .. bracketclose elseif mw.ustring.find( host, "parliament.uk", 1, plain ) then ulx.url1.tail = " at the " .. bracketopen .. "UK Parliament's Web Archive" .. bracketclose elseif mw.ustring.find( host, "nlb.gov.sg", 1, plain ) then ulx.url1.tail = " at " .. "Web Archive Singapore" elseif mw.ustring.find( host, "pandora.nla.gov.au", 1, plain ) then ulx.url1.tail = " at " .. bracketopen .. "Pandora Archive" .. bracketclose elseif mw.ustring.find( host, "perma.cc", 1, plain ) then ulx.url1.tail = " at " .. bracketopen .. "Perma.cc" .. bracketclose elseif mw.ustring.find( host, "perma-archives.cc", 1, plain ) then ulx.url1.tail = " at " .. bracketopen .. "Perma.cc" .. bracketclose elseif mw.ustring.find( host, "screenshots.com", 1, plain ) then ulx.url1.tail = " at Screenshots" elseif mw.ustring.find( host, "wikiwix.com", 1, plain ) then ulx.url1.tail = " at Wikiwix" elseif mw.ustring.find( host, "freezepage.com", 1, plain ) then ulx.url1.tail = " at Freezepage" elseif mw.ustring.find( host, "webcache.googleusercontent.com", 1, plain ) then ulx.url1.tail = " at Google Cache" elseif mw.ustring.find( host, "timetravel.mementoweb.org", 1, plain ) then ulx.url1.tail = " at " .. bracketopen .. "Memento Project" .. bracketclose else tracking = "Category:Webarchive template unknown archives" end track[tracking] = 1 end --[[--------------------------< parseExtraArgs >----------------------- Parse numbered arguments starting at 2, such as url2..url10, date2..date10, title2..title10 For example: {{webarchive |url=.. |url4=.. |url7=..}} Three url arguments not in numeric sequence (1..4..7). Function only processes arguments numbered 2 or greater (in this case 4 and 7) It creates numeric sequenced table entries like: urlx.url2.url = <argument value for url4> urlx.url3.url = <argument value for url7> Returns the number of URL arguments found numbered 2 or greater (in this case returns "2") ]] local function parseExtraArgs() local i, j, argurl, argurl2, argdate, argtitle j = 2 for i = 2, maxurls do argurl = "url" .. i if trimArg(args[argurl]) then argurl2 = "url" .. j ulx[argurl2] = {} ulx[argurl2]["url"] = args[argurl] argdate = "date" .. j if trimArg(args[argdate]) then ulx[argurl2]["date"] = args[argdate] else ulx[argurl2]["date"] = inlineRed("[Date missing]", "warning") end argtitle = "title" .. j if trimArg(args[argtitle]) then ulx[argurl2]["title"] = args[argtitle] else ulx[argurl2]["title"] = nil end j = j + 1 end end if j == 2 then return 0 else return j - 2 end end --[[--------------------------< comma >----------------------- Given a date string, return "," if it's MDY ]] local function comma(date) local N = mw.text.split(date, " ") local O = mw.text.split(N[1], "-") -- for ISO if O[1] == "index" then return "" end if not tonumber(O[1]) then return "," else return "" end end --[[--------------------------< createTracking >----------------------- Return data in track[] ie. tracking categories ]] local function createTracking() local sand = "" if tableLength(track) > 0 then for key,_ in pairs(track) do sand = sand .. "[[" .. key .. "]]" end end return sand end --[[--------------------------< createRendering >----------------------- Return a rendering of the data in ulx[][] ]] local function createRendering() local sand, displayheader, displayfield local period1 = "" -- For backwards compat with {{wayback}} local period2 = "." local indexstr = "archived" if ulx.url1.date == "index" then indexstr = "archive" end -- For {{wayback}}, {{webcite}} if ulx.url1.format == "none" then if not ulx.url1.title and not ulx.url1.date then -- No title. No date sand = "[" .. ulx.url1.url .. " Archived]" .. ulx.url1.tail elseif not ulx.url1.title and ulx.url1.date then -- No title. Date. if ulx.url1.service == "wayback" then period1 = "." period2 = "" end sand = "[" .. ulx.url1.url .. " Archived] " .. ulx.url1.date .. comma(ulx.url1.date) .. ulx.url1.tail .. period1 elseif ulx.url1.title and not ulx.url1.date then -- Title. No date. sand = "[" .. ulx.url1.url .. " " .. ulx.url1.title .. "]" .. ulx.url1.tail elseif ulx.url1.title and ulx.url1.date then -- Title. Date. sand = "[" .. ulx.url1.url .. " " .. ulx.url1.title .. "]" .. ulx.url1.tail .. " (" .. indexstr .. " " .. ulx.url1.date .. ")" else return nil end if ulx.url1.extraurls > 0 then -- For multiple archive URLs local tot = ulx.url1.extraurls + 1 sand = sand .. period2 .. " Additional archives: " for i=2,tot do local indx = "url" .. i if ulx[indx]["title"] then displayfield = "title" else displayfield = "date" end sand = sand .. "[" .. ulx[indx]["url"] .. " " .. ulx[indx][displayfield] .. "]" if i == tot then sand = sand .. "." else sand = sand .. ", " end end else return sand end return sand -- For {{cite archives}} else if ulx.url1.format == "addlarchives" then -- Multiple archive services displayheader = "Additional archives: " else -- Multiple pages from the same archive displayheader = "Additional pages archived on " .. ulx.url1.date .. ": " end local tot = 1 + ulx.url1.extraurls local sand = displayheader for i=1,tot do local indx = "url" .. i displayfield = ulx[indx]["title"] if ulx.url1.format == "addlarchives" then if not displayfield then displayfield = ulx[indx]["date"] end else if not displayfield then displayfield = "Page " .. i end end sand = sand .. "[" .. ulx[indx]["url"] .. " " .. displayfield .. "]" if i == tot then sand = sand .. "." else sand = sand .. ", " end end return sand end end function p.webarchive(frame) args = frame.args if (args[1]==nil) and (args["url"]==nil) then -- if no argument provided than check parent template/module args args = frame:getParent().args end local tname = "Webarchive" -- name of calling template. Change if template rename. ulx = {} -- Associative array to hold template data track = {} -- Associative array to hold tracking categories maxurls = 10 -- Max number of URLs allowed. local verifydates = "yes" -- See documentation. Set "no" to disable. -- URL argument (first) local url1 = trimArg(args.url) or trimArg(args.url1) if not url1 then return inlineError("url", "Empty.") .. createTracking() end if mw.ustring.find( url1, "https://web.http", 1, plain ) then -- track bug track["Category:Webarchive template errors"] = 1 return inlineError("url", "https://web.http") .. createTracking() end if url1 == "https://web.archive.org/http:/" then -- track bug track["Category:Webarchive template errors"] = 1 return inlineError("url", "Invalid URL") .. createTracking() end ulx.url1 = {} ulx.url1.url = url1 local uri1 = mw.uri.new(ulx.url1.url) ulx.url1.host = uri1.host ulx.url1.extraurls = parseExtraArgs() -- Nolink argument local nolink = trimArg2(args.nolink) serviceName(uri1.host, nolink) -- Date argument local date = trimArg(args.date) or trimArg(args.date1) if date == "*" and ulx.url1.service == "wayback" then date = "index" elseif date and ulx.url1.service == "wayback" and verifydates == "yes" then local ldf = dateFormat(date) if ldf then local udate = decodeWaybackDate( uri1.path, ldf ) if udate ~= date then date = udate .. inlineRed("<sup>[Date mismatch]</sup>", "warning") end end elseif date and ulx.url1.service == "webcite" and verifydates == "yes" then local ldf = dateFormat(date) if ldf then local udate = decodeWebciteDate( uri1.path, ldf ) if udate == "query" then -- skip elseif udate ~= date then date = udate .. inlineRed("<sup>[Date mismatch]</sup>", "warning") end end elseif date and ulx.url1.service == "archiveis" and verifydates == "yes" then local ldf = dateFormat(date) if ldf then local udate = decodeArchiveisDate( uri1.path, ldf ) if udate == "short link" then -- skip elseif udate ~= date then date = udate .. inlineRed("<sup>[Date mismatch]</sup>", "warning") end end elseif not date and ulx.url1.service == "wayback" then date = decodeWaybackDate( uri1.path, "iso" ) if not date then date = inlineRed("[Date error] (1)", "error") end elseif not date and ulx.url1.service == "webcite" then date = decodeWebciteDate( uri1.path, "iso" ) if date == "query" then date = inlineRed("[Date missing]", "warning") elseif not date then date = inlineRed("[Date error] (1)", "error") end elseif not date and ulx.url1.service == "archiveis" then date = decodeArchiveisDate( uri1.path, "iso" ) if date == "short link" then date = inlineRed("[Date missing]", "warning") elseif not date then date = inlineRed("[Date error] (1)", "error") end elseif not date then date = inlineRed("[Date missing]", "warning") end ulx.url1.date = date -- Format argument local format = trimArg(args.format) if not format then format = "none" else if format == "addlpages" then if not ulx.url1.date then format = "none" end elseif format == "addlarchives" then format = "addlarchives" else format = "none" end end ulx.url1.format = format -- Title argument local title = trimArg(args.title) or trimArg(args.title1) ulx.url1.title = title local rend = createRendering() if not rend then rend = '<span style="font-size:100%" class="error citation-comment">Error in [[:Template:' .. tname .. ']]: Unknown problem. Please report on template talk page.</span>' track["Category:Webarchive template errors"] = 1 end return rend .. createTracking() end return p