wpylib/file/file_utils.py

#!/usr/bin/python
# $Id: file_utils.py,v 1.2 2010-09-27 19:54:29 wirawan Exp $
#
# pyqmc.utils.file_utils module
# File-manipulation utilities
#
# Wirawan Purwanto
# Created: 20090601
#
# Routines put here are commonly used in my own scripts.
# They are not necessarily suitable for general-purpose uses; evaluate
# your needs and see if they can meet them as well.
#
# 20090601: Created as pyqmc.utils.file_utils .
# 20100927: Moved to wpylib.file.file_utils .
#
"""
Common file-manipulation utilities.

This module is part of wpylib project.
"""

import bz2
import glob
import gzip
import os
import os.path
try:
  import subprocess
  has_subprocess = True
except:
  has_subprocess = False

try:
  import lzma
  has_lzma = True
except:
  try:
    from backports import lzma
    has_lzma = True
  except:
    has_lzma = False


from wpylib.sugar import is_iterable

class super_file(object):
  '''"Super-file" hack wrapper for a file-like object.
  Intended to allow extra capabilities to file-like iterators such as:
  * ability to push back text lines for the subsequent next() calls.
    This is to provide some level of rewinding in parsing text files.

  The wrapper is itself an iterator.  It supports both the Python 2
  (`next') and the Python 3 (`__next__') iterator protocols, so it can be
  used in a `for' loop or with the builtin next() on either version.

  Pushed-back items are returned in last-in, first-out order, i.e. the
  most recently pushed item is the first one to be returned again.
  '''
  def __init__(self, obj):
    '''Creates a super_file wrapper around the "obj" object.
    "obj" must be an iterator (e.g. an open file object).'''
    self.obj = obj
    # Stack of pushed-back items; the top of the stack is the list's end.
    self.pushback = []
  def __iter__(self):
    return self
  def close(self):
    '''Closes the wrapped object; returns whatever its close() returns.'''
    return self.obj.close()
  def flush(self):
    '''Flushes the wrapped object; returns whatever its flush() returns.'''
    return self.obj.flush()
  def next(self):
    '''Returns the most recently pushed-back item if there is any;
    otherwise returns the next item of the wrapped iterator.
    StopIteration from the wrapped iterator is propagated as is.'''
    if self.pushback:
      return self.pushback.pop()
    # The builtin next() is used here (a method body does not see the
    # class-level name `next'), so the wrapped object may implement either
    # the Python 2 or the Python 3 iterator protocol.
    return next(self.obj)
  # Python 3 spelling of the iterator protocol.
  __next__ = next
  def push(self, s):
    '''Pushes back the item "s" so that it is returned by a later next()
    call.'''
    self.pushback.append(s)


def open_input_file(fname, superize=0):
  '''Opens an input file for reading, transparently decompressing it
  based on the filename extension:

  * .bz2       -- bz2.BZ2File
  * .gz, .Z    -- gzip.GzipFile
  * .lzma, .xz -- lzma.LZMAFile if an lzma module could be imported;
                  otherwise the standard output of an external
                  decompressor process ("lzma"/"xz" run with "-dc").
  * otherwise  -- the builtin open().

  If "superize" is true, the file object is wrapped in a super_file
  object before being returned.

  Raises IOError if a .lzma file is given and neither an lzma module nor
  an lzma/xz executable in $PATH is available.
  '''
  if fname.endswith(".bz2"):
    fobj = bz2.BZ2File(fname, "r")
  elif fname.endswith(".gz") or fname.endswith(".Z"):
    # NOTE(review): ".Z" usually denotes compress(1) output; confirm that
    # the gzip module can actually read the .Z files used with this code.
    fobj = gzip.GzipFile(fname, "r")
  elif fname.endswith(".lzma"):
    # Prefer the lzma module; fall back to an external executable.
    if has_lzma:
      fobj = lzma.LZMAFile(fname, "r")
    else:
      from wpylib.shell_tools import is_executable_file
      lzma_exe = path_search(os.environ["PATH"].split(os.pathsep),
                             ("lzma", "xz"),
                             filetest=is_executable_file)
      if lzma_exe is None:
        raise IOError("Cannot find lzma or xz executable file.")
      if has_subprocess:
        px = subprocess.Popen((lzma_exe, "-dc", fname), stdout=subprocess.PIPE)
        fobj = px.stdout
      else:
        # The executable found above must be part of the command line
        # (it was missing before, which made this fallback unusable).
        # NOTE: fname is interpolated into a shell command here; this
        # is only safe for trusted file names.
        fobj = os.popen('"' + lzma_exe + '" -dc "' + fname + '"', "r")
  elif fname.endswith(".xz"):
    # Prefer the lzma module; fall back to the "xz" executable.
    if has_lzma:
      fobj = lzma.LZMAFile(fname, "r")
    elif has_subprocess:
      px = subprocess.Popen(("xz", "-dc", fname), stdout=subprocess.PIPE)
      fobj = px.stdout
    else:
      # NOTE: same shell-interpolation caveat as for .lzma above.
      fobj = os.popen('xz -dc "' + fname + '"', "r")
  else:
    fobj = open(fname, "r")

  if superize:
    return super_file(fobj)
  else:
    return fobj


# Miscellaneous functions:
# - globbing
# - file searches and scans


def glob_files(filespec):
  '''Processes a glob string, or does nothing (pass-on only) if an iterable object
  (e.g. list or tuple) is already given.
  When globbing is done, the result is sorted for predictability.

  Returns a sorted list of matching path names for a string argument, or
  the very same object (not copied, not re-sorted) for an iterable
  argument.
  Raises ValueError for any other kind of argument.
  '''
  try:
    string_types = basestring        # Python 2
  except NameError:
    string_types = (str, bytes)      # Python 3
  # Strings must be tested first: on Python 3 they have __iter__ too and
  # would otherwise be passed on unglobbed.
  if isinstance(filespec, string_types):
    return sorted(glob.glob(filespec))
  elif getattr(filespec, "__iter__", False):
    return filespec # no re-sorting
  else:
    # str() is required: a type object cannot be concatenated to a string.
    raise ValueError("Don't know how to glob for an object of " + str(type(filespec)))


def path_search(*specs, **opts):
  '''Generalized path search.
  Multiple paths can be specified for different parts of the sought filename,
  and the first file found is returned.

  Each positional argument describes one part of the path name: either a
  single value (e.g. a string) or an iterable of alternatives.  All the
  combinations of the alternatives are tried; the parts are joined with
  the path separator in the order of the arguments.  The alternatives of
  the first argument are cycled through fastest, those of the last
  argument slowest.  Example:

      path_search(("/usr/bin", "/bin"), ("lzma", "xz"))

  tests /usr/bin/lzma, /bin/lzma, /usr/bin/xz, /bin/xz in that order.

  Additional options:
  * pathsep="/"  -- path separator
  * filetest=os.path.isfile  -- filetest operator to be used
  * raise_error=False  -- do we want to raise an exception if the file
    is not found after all possible searches?

  Returns the first path name for which filetest() is true, or None if
  there is no such path (unless raise_error is true, in which case
  ValueError is raised instead).
  '''
  import itertools

  try:
    string_types = basestring        # Python 2
  except NameError:
    string_types = (str, bytes)      # Python 3

  pathsep = opts.get("pathsep", "/")
  filetest = opts.get("filetest", os.path.isfile)

  xspecs = []
  for spec in specs:
    # A string is always a single path component, even where strings
    # themselves are iterable (Python 3).
    if isinstance(spec, string_types) or not is_iterable(spec):
      xspecs.append((spec,))
    else:
      xspecs.append(tuple(spec))

  # FIXME: this can be extremely expensive!
  # itertools.product advances its last argument fastest; feeding it the
  # specs in reverse order makes the *first* spec the fastest-varying one.
  for combo in itertools.product(*reversed(xspecs)):
    # combo holds the parts from the last one to the first one, so the
    # path name is assembled right-to-left.
    s = ""
    for part in combo:
      if s == "":
        # Nothing accumulated yet (or only empty trailing parts so far):
        # no separator is inserted.
        s = part
      else:
        s = part + pathsep + s
    if filetest(s):
      return s

  if opts.get("raise_error", False):
    raise ValueError("Cannot find file with specified combination")
  else:
    return None


def scan_directories(D, testdir):
  """Walks the directory tree rooted at D (top-down, via os.walk) and
  collects every visited directory for which the testdir predicate
  returns a true value.

  Symlinked directories are *not* descended into.

  The predicate is invoked once per visited directory as:

     testdir(path, dirs=dirs, files=files)

  where:

  - path (the only positional argument) is the directory currently
    being examined,
  - dirs (passed by keyword) is the list of subdirectory entries of that
    directory (symlinks or not),
  - files (passed by keyword) is the list of all the other entries of
    that directory (files, other symlinks, pipes, sockets, etc).

  Returns the list of the accepted directory paths, in the order they
  were visited.
  """
  return [
    path
    for (path, subdirs, entries) in os.walk(D, topdown=True)
    if testdir(path, dirs=subdirs, files=entries)
  ]


def untar(archive, subdir=None, verbose=None, files=()):
  '''Extracts a TAR archive. The destination directory can be given; otherwise
  the files are extracted to the current directory.
  Assuming GNU tar which accepts -z and -j switches.
  LZMA and XZ compressions are supported via the lzma and xz programs.

  Arguments:
  * archive -- name of the archive file; the decompression switch is
    chosen from its extension.
  * subdir -- if given, passed to tar via its -C switch.
  * verbose -- verbosity level: the number of -v switches passed to tar.
  * files -- optional sequence of member names to extract (default: the
    arguments are omitted, so tar extracts everything).

  Returns the value of os.spawnvp() in P_WAIT mode (the exit code of the
  tar process).
  '''
  opts = [ 'tar' ]
  # Python doc says: "the arguments to the child process must start with the
  # name of the command being run"

  if subdir:
    opts += [ "-C", subdir ]

  if archive.endswith((".tar.bz2", ".tbz2", ".tbz")):
    opts.append("-j")
  elif archive.endswith((".tar.Z", ".tar.gz", ".tgz")):
    opts.append("-z")
  elif archive.endswith((".tar.lzma", ".tza")):
    opts.append("--use-compress-program=lzma")
  elif archive.endswith((".tar.xz", ".txz")):
    opts.append("--use-compress-program=xz")

  if verbose:
    # One -v per verbosity level; a negative level adds none.
    opts.extend([ "-v" ] * verbose)

  opts += [ "-xf", archive ]
  opts.extend(files)

  return os.spawnvp(os.P_WAIT, "tar", opts)
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`#!/usr/bin/python`
* Update documentations. 14 years ago			`# $Id: file_utils.py,v 1.2 2010-09-27 19:54:29 wirawan Exp $`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`#`
			`# pyqmc.utils.file_utils module`
			`# File-manipulation utilities`
			`#`
			`# Wirawan Purwanto`
			`# Created: 20090601`
			`#`
			`# Routines put here are commonly used in my own scripts.`
			`# They are not necessarily suitable for general-purpose uses; evaluate`
			`# your needs and see if they can them as well.`
			`#`
* Update documentations. 14 years ago			`# 20090601: Created as pyqmc.utils.file_utils .`
			`# 20100927: Moved to wpylib.file.file_utils .`
			`#`
			`"""`
			`Common file-manipulation utilities.`

			`This module is part of wpylib project.`
			`"""`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago
			`import bz2`
			`import glob`
			`import gzip`
			`import os`
* wpylib.file.file_utils: added file_exists_nonempty() function. 12 years ago			`import os.path`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`try:`
			`import subprocess`
			`has_subprocess = True`
			`except:`
			`has_subprocess = False`

* Added support for .xz file extension. * Using backported lzma module (on python <= 3.2) to eliminate subprocess, if possible. 11 years ago			`try:`
			`import lzma`
			`has_lzma = True`
			`except:`
			`try:`
			`from backports import lzma`
			`has_lzma = True`
			`except:`
			`has_lzma = False`


* Use is_iterable to detect iterable specs argument. 11 years ago			`from wpylib.sugar import is_iterable`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago
			`class super_file(object):`
			`'''"Super-file" hack wrapper for a file-like object.`
			`Intended to allow extra capabilities to file-like iterators such as:`
			`* ability to push back text lines for the subsequent next() calls.`
			`This is to provide some level of rewinding in parsing text files.`
			`* what else?`
			`'''`
			`def __init__(self, obj):`
			`'''Creates a super_file wrapper around the "obj" object.'''`
			`self.obj = obj`
			`self.pushback = []`
			`def __iter__(self):`
			`return self`
			`def close(self):`
			`return self.obj.close()`
			`def flush(self):`
			`return self.obj.flush()`
			`def next(self):`
			`if len(self.pushback) > 0:`
			`return self.pushback.pop()`
			`else:`
			`return self.obj.next()`
			`def push(self, s):`
			`self.pushback.append(s)`


			`def open_input_file(fname, superize=0):`
			`if fname.endswith(".bz2"):`
			`fobj = bz2.BZ2File(fname, "r")`
			`elif fname.endswith(".gz") or fname.endswith(".Z"):`
			`fobj = gzip.GzipFile(fname, "r")`
			`elif fname.endswith(".lzma"):`
			`# until lzma has a "standard" python module, we use "lzma" executable:`
* Added support for .xz file extension. * Using backported lzma module (on python <= 3.2) to eliminate subprocess, if possible. 11 years ago			`if has_lzma:`
			`fobj = lzma.LZMAFile(fname, "r")`
			`else:`
			`from wpylib.shell_tools import is_executable_file`
			`lzma_exe = path_search(os.environ["PATH"].split(os.pathsep),`
			`("lzma", "xz"),`
			`filetest=is_executable_file)`
			`if lzma_exe == None:`
			`raise IOError, "Cannot find lzma or xz executable file."`
			`if has_subprocess:`
			`px = subprocess.Popen((lzma_exe, "-dc", fname), stdout=subprocess.PIPE)`
			`fobj = px.stdout`
			`else:`
			`fobj = os.popen('" -dc "' + fname + '"', "r")`
			`elif fname.endswith(".xz"):`
			`# until lzma has a "standard" python module, we use "lzma" executable:`
			`if has_lzma:`
			`fobj = lzma.LZMAFile(fname, "r")`
			`elif has_subprocess:`
			`px = subprocess.Popen(("xz", "-dc", fname), stdout=subprocess.PIPE)`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`fobj = px.stdout`
			`else:`
* Added support for .xz file extension. * Using backported lzma module (on python <= 3.2) to eliminate subprocess, if possible. 11 years ago			`fobj = os.popen('xz -dc "' + fname + '"', "r")`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`else:`
			`fobj = open(fname, "r")`

			`if superize:`
			`return super_file(fobj)`
			`else:`
			`return fobj`


* Added function scan_directories() -- initial version for scanning recursively all through directories (no symlink following), to collect all subdirectories that satisfy the test criteria defined in the testdir() function argument. 11 years ago			`# Miscellaneous functions:`
			`# - globbing`
			`# - file searches and scans`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago
* wpylib.file.file_utils: added file_exists_nonempty() function. 12 years ago
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`def glob_files(filespec):`
			`'''Processes a glob string, or does nothing (pass-on only) if an iterable object`
			`(e.g. list or tuple) is already given.`
			`When globbing is done, the result is sorted for predictability.'''`
			`if getattr(filespec, "__iter__", False):`
			`return filespec # no re-sorting`
* Changing all `isinstance(STUFF, str)' to `isinstance(STUFF, basestring)' for future-proofing this code. 13 years ago			`elif isinstance(filespec, basestring):`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`return sorted(glob.glob(filespec))`
			`else:`
			`raise ValueError, "Don't know how to glob for an object of " + type(filespec)`


			`def path_search(specs, *opts):`
			`'''Generalized path search.`
			`Multiple paths can be specified for different parts of the sought filename,`
			`and the first file found is returned.`

			`Additional options:`
			`* pathsep="/" -- path separator`
			`* filetest=os.path.isfile -- filetest operator to be used`
			`* raise_error=False -- do we want to raise an exception if the file`
* wpylib.file.file_utils: added file_exists_nonempty() function. 12 years ago			`is not found after all possible searches?`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`'''`
* wpylib.file.file_utils: added file_exists_nonempty() function. 12 years ago			`path_join = os.path.join`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`# FIXME: this can be extremely expensive!`
			`xspecs = []`
			`xlen = []`
			`xstride = []`
			`xtot = 1`
			`pathsep = opts.get("pathsep", "/")`
			`filetest = opts.get("filetest", os.path.isfile)`

			`for spec in specs:`
* Use is_iterable to detect iterable specs argument. 11 years ago			`if not is_iterable(spec): # maybe a string?`
* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`xspecs.append((spec,))`
			`xlen.append(1)`
			`else:`
			`xspecs.append(tuple([ x for x in spec ]))`
			`xlen.append(len(xspecs[-1]))`
			`xstride.append(xtot)`
			`xtot *= xlen[-1]`

			`for idx in xrange(xtot):`
			`idx0 = idx`
			`# Construct the filename based on the index: we reconstruct`
			`# the indices for all the parts given in the argument, then`
			`# concatenate them to get the full pathname`
			`s = ""`
			`for d in xrange(len(xspecs)-1,-1,-1):`
			`a = idx0 / xstride[d]`
			`if s == "":`
			`s = xspecs[d][a]`
			`else:`
			`s = xspecs[d][a] + pathsep + s`
			`idx0 = idx0 % xstride[d]`
			`#print a,`
			`#print s`
			`if filetest(s):`
			`return s`

			`if opts.get("raise_error", False):`
			`raise ValueError, "Cannot find file with specified combination"`
			`else:`
			`return None`


* Added function scan_directories() -- initial version for scanning recursively all through directories (no symlink following), to collect all subdirectories that satisfy the test criteria defined in the testdir() function argument. 11 years ago			`def scan_directories(D, testdir):`
			`"""Recursively scans a directory tree for candidate of`
			`relevant directories, where testdir(D,dirs,files)`
			`return a True boolean value.`

			`We will not follow symlinks.`

			`The testdir function must have this kind of prototype:`

			`testdir(D, dirs, files)`

			`where:`

			`- D (first positional argument) is the directory under consideration`
			`- dirs (named argument) is a list containing all subdirectory entries`
			`contained in D (symlinks or not).`
			`- files (named argument) is a list containing all non-subdirectory`
			`entries contained in D (other symlinks, files, pipes, sockets, etc).`
			`"""`
			`rslt = []`
			`for (d, dirs, files) in os.walk(D, topdown=True):`
			`if testdir(d, dirs=dirs, files=files):`
			`rslt.append(d)`
			`return rslt`


* Moving pyqmc.utils.file_utils to wpylib project. 14 years ago			`def untar(archive, subdir=None, verbose=None, files=[]):`
			`'''Extracts a TAR archive. The destination directory can be given; otherwise`
			`the files are extracted to the current directory.`
			`Assuming GNU tar which accepts -z and -j switches.`
			`LZMA compression is supported via lzma program.`
			`'''`
			`opts = [ 'tar' ]`
			`# Python doc says: "the arguments to the child process must start with the`
			`# name of the command being run"`

			`if subdir:`
			`opts += [ "-C", subdir ]`

			`if archive.endswith(".tar.bz2") or archive.endswith(".tbz2") or archive.endswith(".tbz"):`
			`opts.append("-j")`
			`elif archive.endswith(".tar.Z") or archive.endswith(".tar.gz") or archive.endswith(".tgz"):`
			`opts.append("-z")`
			`elif archive.endswith(".tar.lzma") or archive.endswith(".tza"):`
			`opts.append("--use-compress-program=lzma")`

			`if verbose:`
			`for i in xrange(verbose): opts.append("-v")`

			`opts += [ "-xf", archive ]`
			`opts += files`

			`return os.spawnvp(os.P_WAIT, "tar", opts)`