Diffstat (limited to 'lib/gorg/cache.rb')
-rw-r--r--  lib/gorg/cache.rb  493
1 file changed, 493 insertions, 0 deletions
diff --git a/lib/gorg/cache.rb b/lib/gorg/cache.rb
new file mode 100644
index 0000000..543b6a2
--- /dev/null
+++ b/lib/gorg/cache.rb
@@ -0,0 +1,493 @@
### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA


# Cache a bit of data based on
# . a path name as received by a webserver
# . a list of parameters as received by a webserver
# . a list of files it depends on

require "parsedate"
require "fileutils"
require "find"
require "timeout"
require "digest"
require "digest/md5"

module Gorg

CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."

module Cache
  def Cache.init(config)
    @@lockfile = ".cache.cleaner.lock"
    @cacheDir = nil
    if FileTest.directory?(config["cacheDir"])
      if FileTest.writable?(config["cacheDir"])
        @cacheDir = config["cacheDir"].chomp("/")
      else
        warn "Cache directory not writable"
      end
    else
      warn "Invalid cache directory"
    end

    @zipLevel = config["zipLevel"]
    @zip = @zipLevel > 0 ? ".gz" : ""
    # Time-To-Live in seconds, cached items older than that are considered too old
    @ttl = config["cacheTTL"]
    @cacheTree = config["cacheTree"]
    @maxFiles = config["maxFiles"]           # Max number of files in a single directory
    @maxSize = config["cacheSize"]*1024*1024 # Now in bytes
    @washNumber = config["cacheWash"]        # Clean cache dir after a store operation whenever rand(@washNumber) < 10
    @lastCleanup = Time.new-8e8              # Remember last time we started a cleanup so we don't pile them up
  end

  def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
    # objPath is typically a requested path passed from a web request, but it
    # can be just any string. It is not checked against any actual files on the file system.
    #
    # objParam is expected to be a hash, or any object whose iterator yields two values.
    #
    # Two filenames are built from the arguments; they give the names of
    # a meta file and of a result file.
    # If the result file is older than @ttl seconds, the hit fails.
    # The meta file is then checked for dependencies.
    # It contains a list of filenames along with their size and mtime, separated by ;;
    #
    # etags and ifmodsince are used in a webserver context.
    # etags is defined if an ETag was part of an If-None-Match request field;
    # it can be an array or a single string.
    # If the current ETag of the meta file matches, no data is returned (webserver should return a 304).
    #
    # ifmodsince is a time object passed on an If-Modified-Since request field.
    # If the creation date of the meta file is earlier, no data is returned (webserver should return a 304).

    return nil if @cacheDir.nil? # Not initialized, ignore request
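    # For illustration only, a meta file as written by Cache.store below might
    # contain something like this (hypothetical version, path, size, and date):
    #
    #   Gorg-0.6 Cached This Data. Do not alter this file. Thanks.
    #   /var/www/xml/index.xml;;12345;;Mon Jan 10 10:00:00 UTC 2005;;r
    #   ;;extra meta
    #   Content-Type:text/html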

    # Reminder: filenames are full paths, no need to prepend dirname
    dirname, basename, filename, metaname = makeNames(objPath, objParam)

    raise "Cache subdir does not exist" unless FileTest.directory?(dirname)

    # Hit the cache
    meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
    raise "Empty/No meta file" if meta.nil? || meta.length < 1

    fstat = File.stat(filename) if filename && FileTest.file?(filename)
    raise "Empty/No data file" if fstat.nil?

    # Check the timestamps of files listed in the metadata
    meta = meta.split("\n")
    raise "I did not write that meta file" unless CacheStamp == meta.shift
    mline = meta.shift
    while mline and mline !~ /^;;extra meta$/ do
      f, s, d = mline.split(";;")
      if s.to_i < 0
        # File did not exist when cache entry was created
        raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
      else
        # File did exist when cache entry was created, is it still there?
        raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)

        fst = File.stat(f)
        raise "Size of #{f} has changed from #{s.to_i} to #{fst.size}" unless fst.size == s.to_i
        raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
      end
      mline = meta.shift
    end
    if mline =~ /^;;extra meta$/ then
      extrameta = meta.dup
    else
      extrameta = []
    end

    if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
      raise Gorg::Status::NotModified.new(fstat)
    end

    file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
    raise "Empty/No data file" if file.nil? || file.length < 1

    # Is the data file too old?
    raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl

    # Update atime of files; ignore failures as files might have just been removed
    begin
      t = Time.new
      File.utime(t, fstat.mtime, filename)
      File.utime(t, mstat.mtime, metaname)
    rescue
      nil
    end

    # If we get here, the data file can be used; return the cache object (data, stat(datafile), extrameta).
    # The file is left (un)compressed, it's returned as it was stored.
    [file, fstat, extrameta]

  rescue Gorg::Status::NotModified
    # Nothing changed, caller should return a 304
    debug("Client cache is up-to-date")
    raise
  rescue
    # A cache hit fails if anything goes wrong, no exception raised
    debug("Cache hit on #{objPath} failed (#{$!})")
    nil
  end


  def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
    # Store data in the cache so it can be retrieved based on objPath and objParam.
    # deps should contain a list of files that the object depends on,
    # as returned by our xsl processor, i.e. an array of [access_type, path] where
    # access_type can be "r", "w", or "o" for respectively read, write, other.

    # Define content-type
    ct = setContentType(data)
    extrameta << "Content-Type:#{ct}"

    return nil if @cacheDir.nil? # Not initialized, ignore request
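    # For illustration only, a hypothetical deps array as described above:
    #   [["r", "/var/www/xml/index.xml"], ["r", "/var/www/xsl/main.xsl"]]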

    # Cache only if no remote objects (ftp:// or http://) in the list of used files
    if deps && deps.detect{|f| f[0] =~ /^o$/i }
      debug "#{objPath} not cached because it needs remote resources"
      return nil
    end

    dirname, basename, filename, metaname = makeNames(objPath, objParam)

    FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)

    # Write the meta file to a temp file (with .timestamp.randomNumber appended)
    metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"

    # Data might need to be just a link to another .Data file
    # if we find another requested path with different params but
    # with identical MD5 sums.
    # Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
    # in its name that we can hard link to.
    # e.g. a moron requests 10 full handbooks with toto=1..10 in the URI,
    # we'd end up with 10 identical large copies; with links we have only one.

    # Old versions are expected to be cleaned up by the cacheWash() routine.
    # A Dir.glob() to find the previous ones would be too expensive.

    # Compute MD5 digest
    md5 = Digest::MD5.hexdigest(data)

    # Compress data if required
    if @zipLevel > 0 then
      bodyZ = data = gzip(data, @zipLevel)
    else
      bodyZ = nil
    end

    # Set mtime of the data file to the latest mtime of all required files
    # so that caching can work better, because mtimes will be
    # identical on all webnodes whereas the creation date of the data
    # would be different on each node.
    maxmtime = Time.now-8e8
    fstat = nil

    begin
      timeout(10){
        File.open("#{metaname_t}", "w") {|fmeta|
          fmeta.puts(CacheStamp)
          # Write filename;;size;;mtime for each file in deps[]
          deps.each {|ffe|
            ftype = ffe[0]
            fdep = ffe[1]
            if FileTest.file?(fdep)
              s = File.stat(fdep)
              fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
              maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
            else
              # A required file does not exist; use size=-1 and an old timestamp
              # so that when the file comes back, the cache notices a difference,
              # and no cache miss gets triggered as long as the file does not exist
              fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
            end
          }
          fmeta.puts ";;extra meta"
          extrameta.each { |m| fmeta.puts m }
        }
        # Get exclusive access to the cache directory while moving files and/or creating data files
        File.open(dirname) { |lockd|
          while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
            # Timeout does not occur on a blocking lock;
            # try a non-blocking one repeatedly until the timeout occurs or the lock is granted.
            # We are in a timeout block, remember.
            sleep 0.1
          end
          # Remove previous Data
          FileUtils.rm_rf(filename)

          # mv temp meta file to meta file
          FileUtils.mv(metaname_t, metaname)

          # We keep a data file for the same requested path, with different params
          # but ending up with the same MD5 sum, i.e. identical results because of unused params
          linkname = "#{basename}.#{md5}#{@zip}"
          if FileTest.file?(linkname) then
            # Data file already there, link to it
            File.link(linkname, filename)
          else
            # Write the data file and set its mtime to the latest of all files it depends on
            File.open("#{filename}", "w") {|fdata| fdata.write(data)}
            # Create link
            File.link(filename, linkname)
          end
          # mtime might need to be updated, or needs to be set,
          # e.g. when a dependency has changed but the result file is identical.
          # This is needed to keep Last-Modified dates consistent across web nodes.
          File.utime(Time.now, maxmtime, filename)
          fstat = File.stat(filename)
        }
      }
    ensure
      FileUtils.rm_rf(metaname_t)
    end

    # Do we clean the cache?
    washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10

    # Return stat(datafile) even if it's just been removed by washCache,
    # because another web node might still have it or will have it.
    # Anyway, the cached item would be regenerated on a later request
    # and a 304 would be returned if still appropriate at the time.

    # Return fstat of the data file (for etag...) and the zipped file
    [fstat, bodyZ]

  rescue Timeout::Error, StandardError => ex
    if ex.class.to_s =~ /timeout::error/i then
      warn("Timeout in cache store operation")
    else
      warn("Cache store error (#{$!})")
    end
    # Clean up before leaving
    FileUtils.rm_rf(filename||"")
    FileUtils.rm_rf(metaname||"")
    nil # Return nil so that the caller can act if a failed store really is a problem
  end


  def Cache.washCache(dirname, tmout=30, cleanTree=false)
    # Clean cache entries that are either too old compared to the TTL (in seconds)
    # or reduce the total size to maxSize (in MB).
    # cleanTree means subdirectories of dirname are cleaned as well.
    # tmout is the maximum time (in seconds) spent in here.

    return nil if @cacheDir.nil? # Not initialized, ignore request

    # Also ignore request if dirname is not equal to @cacheDir or under it
    return nil unless dirname[0, @cacheDir.length] == @cacheDir

    # Also ignore request if dirname does not exist yet
    return nil unless FileTest.directory?(dirname)

    # Also return if less than a minute has elapsed since the latest cleanup
    t0 = Time.new
    return nil if t0 - @lastCleanup < 60

    # Remember for next time
    @lastCleanup = t0

    Dir.chdir(dirname) { |d|
      # Recreate lock file if it's been lost
      unless File.exist?(@@lockfile)
        File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg") }
      end

      # Grab lockfile
      File.open(@@lockfile) { |lockf|
        if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
          infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
          info(infoMsg)
          puts infoMsg if cleanTree

          timeout(tmout) {
            totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
            if totalSize >= 0 then
              # Size == -1 means dir was locked, throwing an exception would have been nice :)
              infoMsg = if cleanTree then
                "Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
              else
                "#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
              end
              info(infoMsg)
              puts infoMsg if cleanTree
            end
          }
        else
          # Locked dir, another process is busy cleaning up
          debug("#{dirname} locked, skipping")
          puts("#{dirname} locked, skipping") if cleanTree
        end # of lock test
      } # end of File.open(@@lockfile), closes & releases the lock automatically
    }
  rescue Timeout::Error
    info("Timeout while cleaning #{dirname}")
    puts("Timeout while cleaning #{dirname}") if cleanTree
  rescue StandardError => ex
    error("Error while cleaning cache: #{ex}")
    puts("Error while cleaning cache: #{ex}") if cleanTree
  end


  private
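  # For illustration only, the per-directory trim in washDir below with
  # hypothetical numbers: with @maxFiles = 1000 and 1200 files in a directory,
  # 1200 - (9*1000/10) = 300 of the least recently used files are removed,
  # leaving 900 (90% of @maxFiles), so the trim does not fire again on the
  # very next store.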

  def Cache.washDir(dirname, cleanTree)
    # Clean up the cache starting from dirname, and in subdirectories if cleanTree is true.
    # Return [newSize in bytes, # deleted files, # scanned directories]
    size = nDeleted = nDirectories = 0

    Dir.chdir(dirname) { |d|
      hIno = Hash.new(0) # Hash of file inodes with more than one link
      lst = Array.new    # Array of file names, atime, ...
      ttl = @ttl
      ttl = 8e8 if ttl == 0 # No ttl, keep very old docs!

      # Get the list of files sorted on their dirname+atime
      Find.find('.') { |f|
        begin
          unless f =~ /^\.$|#{@@lockfile}/ # Ignore "." and the lockfile
            ff = File.stat(f)
            if ff.directory? then
              Find.prune unless cleanTree
            elsif ff.file? and f =~ /Meta|Data/ then
              hIno[ff.ino] = ff.nlink if ff.nlink > 1
              # Each list entry holds [name, atime, size, # links, inode]
              lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
            end
          end
        rescue
          nil # File.stat can fail because the file could have been deleted, ignore the error
        end
      }

      # Compute total size
      size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }

      # Delete old *.Data.[md5] files that are not referenced anymore
      lst.each { |a|
        if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(.gz)?$/ then
          # Data file with no more links pointing to it
          FileUtils.rm_rf(a[0])
          nDeleted += 1
          size -= a[2]
          a[3] = 0 # Mark as deleted
        end
      }

      # Sort all files on atime
      lst.sort!{ |a1, a2| a1[1] <=> a2[1] }

      t0 = Time.new
      # Clean until size < maxSize _AND_ atime more recent than TTL
      lst.each { |a|
        break if size < @maxSize and t0-a[1] < ttl
        next if a[3] < 1 # Already deleted in previous step
        FileUtils.rm_rf(a[0])
        nDeleted += 1
        # Total size -= file size IF last link to data
        if a[3] == 1 || hIno[a[4]] <= 1 then
          size -= a[2]
        end
        hIno[a[4]] -= 1 if hIno[a[4]] > 0
        a[3] = 0 # Mark as deleted by setting nlinks to 0
      }

      # Remove deleted files from the array
      lst.reject! { |a| a[3] < 1 }

      # Sort files per directory to enforce maxFiles
      if cleanTree then
        # Split the array into one array per directory,
        # keeping the files sorted on atime in each directory
        slst = Hash.new
        lst.length.times {
          a = lst.shift
          d = File.dirname(a[0])
          if slst[d] then
            slst[d] << a
          else
            slst[d] = [a]
          end
        }
      else
        # If not cleaning the whole tree, we have only a single dir
        slst = {"." => lst}
      end

      nDirectories = slst.length

      slst.each { |d, lst|
        # Remove the oldest files so that we have less than @maxFiles in each directory
        if lst.length >= @maxFiles then
          # Remove enough to leave up to 90% of @maxFiles so we don't clean up only a handful of files repeatedly
          (lst.length - 9*@maxFiles/10).times {
            if a = lst.shift then
              FileUtils.rm_rf(a[0])
              nDeleted += 1
              # Total size -= file size IF last link to data
              if a[3] == 1 || hIno[a[4]] <= 1 then
                size -= a[2]
              end
              hIno[a[4]] -= 1 if hIno[a[4]] > 0
            end
          }
        end
      }
    } # end of chdir
    [size, nDeleted, nDirectories]
  end


  def Cache.makeNames(obj, params)
    # Build the meta filename and data filename from the arguments.
    #
    # obj is broken into a path and a filename with appended params,
    # e.g. /proj/en/index.xml?printable=yes becomes /proj/en and index.xml+printable+yes
    # or .#proj#en#index.xml+printable+yes
    # depending on the cacheTree param value.
    #
    # .Meta and .Data are appended respectively to the meta filename and data filename.
    # Base is the filename without appended params,
    # e.g. .#proj#en#index.xml.Data
    if @cacheTree then
      # Use a path and a file
      dir = "#{@cacheDir}#{File.dirname(obj)}"
      base = f = File.basename(obj)
    else
      # Convert the full path into a single filename
      dir = @cacheDir
      base = f = ".#{obj.gsub(/\//,'#')}"
    end

    f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
    # Remove funky chars and squeeze duplicates into single chars
    f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")

    # Return the cache subdir, the base Data name (the hard-link target),
    # the full Data name, and the Meta name
    [dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
  end
end

end
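
For context, here is a minimal usage sketch of this module. The config keys match
the ones read by Cache.init; the paths and values are illustrative only, and the
module also relies on helpers defined elsewhere in gorg (debug/info/warn/error,
gzip, setContentType, notModified?, Gorg::Status::NotModified).

  require "gorg/cache"

  config = {
    "cacheDir"  => "/var/cache/gorg", # Hypothetical cache location
    "zipLevel"  => 2,                 # gzip compression level, 0 disables compression
    "cacheTTL"  => 3600,              # Seconds before a cached item is too old, 0 = no expiry
    "cacheTree" => true,              # Mirror request paths as cache subdirectories
    "maxFiles"  => 1000,              # Max number of files per cache directory
    "cacheSize" => 200,               # Max cache size in MB
    "cacheWash" => 100                # Cleanup runs after a store whenever rand(cacheWash) < 10
  }

  Gorg::Cache.init(config)

  # Store a rendered page along with the files it was built from
  deps = [["r", "/var/www/xml/index.xml"], ["r", "/var/www/xsl/main.xsl"]]
  fstat, gzipped = Gorg::Cache.store("<html>...</html>", "/index.xml",
                                     {"printable" => "yes"}, deps)

  # Serve a later request from the cache; hit returns nil on a miss and
  # raises Gorg::Status::NotModified when a 304 is the appropriate answer
  if cached = Gorg::Cache.hit("/index.xml", {"printable" => "yes"})
    body, stat, extrameta = cached
  end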