### Copyright 2004, Xavier Neys (neysx@gentoo.org)                          #
#                                                                           #
#   This file is part of gorg.                                              #
#                                                                           #
#   gorg is free software; you can redistribute it and/or modify            #
#   it under the terms of the GNU General Public License as published by    #
#   the Free Software Foundation; either version 2 of the License, or       #
#   (at your option) any later version.                                     #
#                                                                           #
#   gorg is distributed in the hope that it will be useful,                 #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of          #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the            #
#   GNU General Public License for more details.                            #
#                                                                           #
#   You should have received a copy of the GNU General Public License       #
#   along with gorg; if not, write to the Free Software                     #
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #

# Cache a bit of data based on
#  . a path name as received by a webserver, e.g. /doc/en/index.xml
#  . a list of parameters as received by a webserver, e.g. ?style=printable
#  . a list of files it depends on

begin
  # Use ParseDate when available, otherwise fall back on Time.parse
  require "parsedate"
  $haveparsedate = true
rescue LoadError
  require "time"
  $haveparsedate = false
end

require "fileutils"
require "find"
require "timeout"
require "digest"
require "digest/md5"

module Gorg

  CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."

  module Cache

    def Cache.init(config)
      @@lockfile = ".cache.cleaner.lock"
      @cacheDir = nil
      if FileTest.directory?(config["cacheDir"])
        if FileTest.writable?(config["cacheDir"])
          @cacheDir = config["cacheDir"].chomp("/")
        else
          warn "Cache directory not writable"
        end
      else
        warn "Invalid cache directory"
      end
      @zipLevel = config["zipLevel"]
      @zip = @zipLevel > 0 ? ".gz" : ""
      # Time-To-Live in seconds, cached items older than that will be considered too old
      @ttl = config["cacheTTL"]
      @cacheTree = config["cacheTree"]
      @maxFiles = config["maxFiles"]            # Max number of files in a single directory
      @maxSize = config["cacheSize"]*1024*1024  # Now in bytes
      @washNumber = config["cacheWash"]         # Clean cache dir after a store operation whenever rand(@washNumber) < 10
      @lastCleanup = Time.new-8e8               # Remember last time we started a cleanup so we don't pile them up
    end

    def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
      # objPath is typically a requested path passed from a web request but it
      # can be just any string. It is not checked against any actual file on the file system.
      #
      # objParam is expected to be a hash or any object whose iterator yields two values
      #
      # Two filenames are built from the arguments: the name of a meta file
      # and the name of a result (data) file.
      # If the result file is older than @ttl seconds, the hit fails.
      # The meta file is then checked for dependencies:
      # it contains a list of filenames along with their size and mtime, separated by ;;
      #
      # etags and ifmodsince are used in a webserver context.
      # etags is defined if an ETag was part of an If-None-Match request field;
      # it can be an array or a single string.
      # If the current ETag of the cached data file matches, no data is returned (webserver should return a 304)
      #
      # ifmodsince is a time object passed on an If-Modified-Since request field.
      # If the cached data file is not more recent, no data is returned (webserver should return a 304)

      return nil if @cacheDir.nil?  # Not initialized, ignore request
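
      # For illustration only: a .Meta file written by Cache.store looks roughly
      # like this (paths, sizes and version number below are made up):
      #
      #   Gorg-0.6 Cached This Data. Do not alter this file. Thanks.
      #   /var/www/doc/en/index.xml;;12345;;Mon Jan 01 12:00:00 UTC 2007;;r
      #   /var/www/xsl/guide.xsl;;6789;;Mon Jan 01 12:00:00 UTC 2007;;r
      #   ;;extra meta
      #   Content-Type:text/html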
      # Reminder: filenames are full paths, no need to prepend dirname
      dirname, basename, filename, metaname = makeNames(objPath, objParam)
      raise "Cache subdir does not exist" unless FileTest.directory?(dirname)

      # Hit the cache
      meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
      raise "Empty/No meta file" if meta.nil? || meta.length < 1

      fstat = File.stat(filename) if filename && FileTest.file?(filename)
      raise "Empty/No data file" if fstat.nil?

      # Check the timestamps of files in the metadata
      meta = meta.split("\n")
      raise "I did not write that meta file" unless CacheStamp == meta.shift
      mline = meta.shift
      while mline and mline !~ /^;;extra meta$/ do
        f, s, d = mline.split(";;")
        if s.to_i < 0
          # File did not exist when cache entry was created
          raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
        else
          # File did exist when cache entry was created, is it still there?
          raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)
          fst = File.stat(f)
          raise "Size of #{f} has changed from #{s.to_i} to #{fst.size}" unless fst.size == s.to_i
          if $haveparsedate
            raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
          else
            raise "Timestamp of #{f} has changed" unless Time.parse(d) == fst.mtime.utc
          end
        end
        mline = meta.shift
      end
      if mline =~ /^;;extra meta$/ then
        extrameta = meta.dup
      else
        extrameta = []
      end

      if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
        raise Gorg::Status::NotModified.new(fstat)
      end

      file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
      raise "Empty/No data file" if file.nil? || file.length < 1

      # Is the data file too old?
      raise "Data file too old" unless @ttl == 0 or (Time.new - fstat.mtime) < @ttl

      # Update atime of files, ignore failures as files might have just been removed
      begin
        t = Time.new
        File.utime(t, fstat.mtime, filename)
        File.utime(t, mstat.mtime, metaname)
      rescue
        nil
      end

      # If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta)
      # The file is left (un)compressed, it's returned as it was stored
      [file, fstat, extrameta]

    rescue Gorg::Status::NotModified
      # Nothing changed, should return a 304
      debug("Client cache is up-to-date")
      raise
    rescue
      # Cache hit fails if anything goes wrong, no exception raised
      debug("Cache hit on #{objPath} failed: (#{$!})")
      nil
    end


    def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
      # Store data in cache so it can be retrieved based on objPath and objParam
      # deps should contain the list of files that the object depends on,
      # as returned by our xsl processor, i.e. an array of [access_type, path] where
      # access_type can be "r", "w", or "o" for respectively read, write, other.

      # Define content-type
      ct = setContentType(data)
      extrameta << "Content-Type:#{ct}"

      return nil if @cacheDir.nil?  # Not initialized, ignore request
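
      # For illustration only, deps typically looks like (hypothetical paths):
      #   [["r", "/var/www/doc/en/index.xml"], ["r", "/var/www/xsl/guide.xsl"]]
      # An entry whose access_type is "o" (other, e.g. an http:// or ftp:// resource)
      # prevents caching altogether, see the test below.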
      # Cache only if no remote objects (ftp:// or http://) in list of used files
      if deps && deps.detect{|f| f[0] =~ /^o$/i }
        debug "#{objPath} not cached because it needs remote resources"
        return nil
      end

      dirname, basename, filename, metaname = makeNames(objPath, objParam)
      FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)

      # Write Meta file to a temp file (with .timestamp.randomNumber appended)
      metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"

      # Data might need to be just a link to another .Data file
      # if we find another requested path with different params but
      # with identical MD5 sums.
      # Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
      # in its name that we can hard link to.
      # e.g. if someone requests 10 full handbooks with toto=1..10 in the URI,
      # we'd end up with 10 identical large copies; with links we have only one.
      # Old versions are expected to be cleaned up by the cacheWash() routine,
      # a Dir.glob() to find the previous ones would be too expensive.

      # Compute MD5 digest
      md5 = Digest::MD5.hexdigest(data)

      # Compress data if required
      if @zipLevel > 0 then
        bodyZ = data = gzip(data, @zipLevel)
      else
        bodyZ = nil
      end

      # Set mtime of data file to latest mtime of all required files
      # so that caching can work better because mtimes will be
      # identical on all webnodes whereas creation date of data
      # would be different on all nodes.
      maxmtime = Time.now-8e8
      fstat = nil
      begin
        timeout(10){
          File.open("#{metaname_t}", "w") {|fmeta|
            fmeta.puts(CacheStamp)
            # Write filename;;size;;mtime for each file in deps[]
            deps.each {|ffe|
              ftype = ffe[0]
              fdep = ffe[1]
              if FileTest.file?(fdep)
                s = File.stat(fdep)
                fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
                maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
              else
                # A required file does not exist, use size=-1 and an old timestamp
                # so that when the file comes back, the cache notices a difference
                # and no cache miss gets triggered as long as the file does not exist
                fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
              end
            }
            fmeta.puts ";;extra meta"
            extrameta.each { |m| fmeta.puts m }
          }
          # Get exclusive access to the cache directory while moving files and/or creating data files
          File.open(dirname) { |lockd|
            while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
              # Timeout does not occur on a blocking lock.
              # Try a non-blocking one repeatedly until the enclosing timeout
              # expires or the lock is granted.
              sleep 0.1
            end
            # Remove previous Data
            FileUtils.rm_rf(filename)
            # mv temp meta file to meta file
            FileUtils.mv(metaname_t, metaname)
            # We keep a data file for the same requested path, with different params,
            # but which ends up with the same MD5 sum, i.e. identical results because of unused params
            linkname = "#{basename}.#{md5}#{@zip}"
            if FileTest.file?(linkname) then
              # Data file already there, link to it
              File.link(linkname, filename)
            else
              # Write data file and set its mtime to latest of all files it depends on
              File.open("#{filename}", "w") {|fdata| fdata.write(data)}
              # Create link
              File.link(filename, linkname)
            end
            # mtime might need to be updated, or needs to be set,
            # e.g. when a dependency has changed but the result file is identical.
            # This is needed to keep Last-Modified dates consistent across web nodes.
            File.utime(Time.now, maxmtime, filename)
            fstat = File.stat(filename)
          }
        }
      ensure
        FileUtils.rm_rf(metaname_t)
      end

      # Do we clean the cache?
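      # rand(@washNumber) < 10 means a store triggers a cleanup with probability
      # 10/@washNumber, e.g. cacheWash=1000 gives roughly one cleanup per 100 stores.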
      washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10

      # Return stat(datafile) even if it's just been removed by washCache
      # because another web node might still have it or will have it.
      # Anyway, the cached item would be regenerated on a later request
      # and a 304 would be returned if still appropriate at the time.

      # Return fstat of data file (for etag...) and zipped file
      [fstat, bodyZ]

    rescue Timeout::Error, StandardError => ex
      if ex.class.to_s =~ /timeout::error/i then
        warn("Timeout in cache store operation")
      else
        warn("Cache store error (#{$!})")
      end
      # Clean up before leaving
      FileUtils.rm_rf(filename||"")
      FileUtils.rm_rf(metaname||"")
      nil  # Return nil so that the caller can act if a failed store really is a problem
    end


    def Cache.washCache(dirname, tmout=30, cleanTree=false)
      # Clean cache entries that are either too old compared to TTL (in seconds)
      # or reduce total size to maxSize (in MB).
      # Unused *.Data.[md5] files that nothing links to anymore (because the source
      # has been modified and has generated a new *.Data.[md5] file) are always removed.
      # cleanTree means clean dirname and all its subdirectories, not just dirname.
      # tmout is the maximum time (in seconds) spent in here.

      return nil if @cacheDir.nil?  # Not initialized, ignore request

      # Also ignore request if dirname is not @cacheDir or a directory under it
      return nil unless dirname[0, @cacheDir.length] == @cacheDir

      # Also ignore request if dirname does not exist yet
      return nil unless FileTest.directory?(dirname)

      # Also return if less than a minute has elapsed since latest cleanup
      t0 = Time.new
      return nil if t0 - @lastCleanup < 60
      # Remember for next time
      @lastCleanup = t0

      Dir.chdir(dirname) { |d|
        # Recreate lock file if it's been lost
        unless File.exist?(@@lockfile)
          File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg") }
        end
        # Grab lockfile
        File.open(@@lockfile) { |lockf|
          if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
            infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
            info(infoMsg)
            puts infoMsg if cleanTree
            timeout(tmout) {
              totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
              if totalSize >= 0 then
                # Size == -1 means dir was locked, throwing an exception would have been nice :)
                infoMsg = if cleanTree then
                  "Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
                else
                  "#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
                end
                info(infoMsg)
                puts infoMsg if cleanTree
              end
            }
          else
            # Locked dir, another process is busy cleaning up
            debug("#{dirname} locked, skipping")
            puts("#{dirname} locked, skipping") if cleanTree
          end # of lock test
        } # end of File.open(@@lockfile), close & release lock automatically
      }
    rescue Timeout::Error
      info("Timeout while cleaning #{dirname}")
      puts("Timeout while cleaning #{dirname}") if cleanTree
    rescue StandardError => ex
      error("Error while cleaning cache: #{ex}")
      puts("Error while cleaning cache: #{ex}") if cleanTree
    end


    private

    def Cache.washDir(dirname, cleanTree)
      # Clean up cache starting from dirname, and in its subdirectories if cleanTree is true
      # Return [newSize in bytes, # deleted files, # scanned directories]
      size = nDeleted = nDirectories = 0
      Dir.chdir(dirname) { |d|
        hIno = Hash.new(0)  # Hash of file inodes with more than one link
        lst = Array.new     # Array of file names, atime, ...
        ttl = @ttl
        ttl = 8e8 if ttl == 0  # No ttl, keep very old docs!

        # Get list of files sorted on their dirname+atime
        Find.find('.') { |f|
          begin
            unless f =~ /^\.$|#{@@lockfile}/  # Ignore "." and the lock file
              ff = File.stat(f)
              if ff.directory? then
                Find.prune unless cleanTree
              elsif ff.file? and f =~ /Meta|Data/ then
                hIno[ff.ino] = ff.nlink if ff.nlink > 1
                # List of files has [name, atime, size, # links, inode]
                lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
              end
            end
          rescue
            nil  # File.stat can fail because the file could have been deleted, ignore the error
          end
        }

        # Compute total size
        size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }

        # Delete old *.Data.[md5] files that are not being referenced anymore
        lst.each { |a|
          if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(.gz)?$/ then
            # Data file with no more links pointing to it
            FileUtils.rm_rf(a[0])
            nDeleted += 1
            size -= a[2]
            a[3] = 0  # Mark as deleted
          end
        }

        # Sort all files on atime
        lst.sort!{ |a1, a2| a1[1] <=> a2[1] }
        t0 = Time.new
        # Clean until size < maxSize _AND_ atime more recent than TTL
        lst.each { |a|
          break if size < @maxSize and t0-a[1] < ttl
          next if a[3] < 1  # Already deleted in previous step
          FileUtils.rm_rf(a[0])
          nDeleted += 1
          # Total size -= file size IF last link to data
          if a[3] == 1 || hIno[a[4]] <= 1 then
            size -= a[2]
          end
          hIno[a[4]] -= 1 if hIno[a[4]] > 0
          a[3] = 0  # Mark as deleted by setting nlinks to 0
        }
        # Remove deleted files from array
        lst.reject! { |a| a[3] < 1 }

        # Sort files per directory to enforce maxFiles
        if cleanTree then
          # Split the array into one array per directory
          # and keep the files sorted on atime in each directory
          slst = Hash.new
          lst.length.times {
            a = lst.shift
            d = File.dirname(a[0])
            if slst[d] then
              slst[d] << a
            else
              slst[d] = [a]
            end
          }
        else
          # If not cleaning the whole tree, we have only a single dir
          slst = {"." => lst}
        end
        nDirectories = slst.length

        slst.each { |d, lst|
          # Remove oldest files so that we have less than @maxFiles in each directory
          if lst.length >= @maxFiles then
            # Remove enough to leave up to 90% of @maxFiles so we don't clean up only a handful of files repeatedly
            (lst.length - 9*@maxFiles/10).times {
              if a = lst.shift then
                FileUtils.rm_rf(a[0])
                nDeleted += 1
                # Total size -= file size IF last link to data
                if a[3] == 1 || hIno[a[4]] <= 1 then
                  size -= a[2]
                end
                hIno[a[4]] -= 1 if hIno[a[4]] > 0
              end
            }
          end
        }
      } # end of chdir
      [size, nDeleted, nDirectories]
    end


    def Cache.makeNames(obj, params)
      # Build meta filename and data filename from the arguments
      #
      # obj is broken into a path and a filename with the params appended,
      # e.g. /proj/en/index.xml?style=printable becomes /proj/en and index.xml+style+printable
      # or .#proj#en#index.xml+style+printable
      # depending on the cacheTree param value.
      # .Meta and .Data are appended respectively to the meta filename and data filename.
      # Base is the filename without the appended params, e.g. .#proj#en#index.xml.Data
      if @cacheTree then
        # Use a path and a file
        dir = "#{@cacheDir}#{File.dirname(obj)}"
        base = f = File.basename(obj)
      else
        # Convert full path into a single filename
        dir = @cacheDir
        base = f = ".#{obj.gsub(/\//,'#')}"
      end
      f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
      # Remove funky chars and squeeze duplicates into single chars
      f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")

      # Return cache subdirectory, base data name (without params), data filename and meta filename
      [dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
    end

  end
end
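
# Illustrative usage sketch, not part of the module. The config keys mirror those
# read by Cache.init; the paths and values below are hypothetical:
#
#   Gorg::Cache.init(
#     "cacheDir"  => "/var/cache/gorg",  # must exist and be writable
#     "cacheTTL"  => 3600,               # seconds, 0 means entries never expire
#     "zipLevel"  => 2,                  # 0 stores data uncompressed
#     "cacheTree" => true,               # mirror the requested paths as directories
#     "maxFiles"  => 1000,               # max files per cache directory
#     "cacheSize" => 200,                # MB
#     "cacheWash" => 500                 # ~1 cleanup per 50 store operations
#   )
#
#   # After generating a document:
#   Gorg::Cache.store(html, "/doc/en/index.xml", {"style" => "printable"},
#                     [["r", "/var/www/doc/en/index.xml"]], [])
#
#   # On a later request (returns nil on a miss, raises Gorg::Status::NotModified
#   # when the client's cached copy is still valid):
#   data, fstat, extrameta = Gorg::Cache.hit("/doc/en/index.xml", {"style" => "printable"})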