From 669e48b8d3ab34aa9b7b5271fcbafc8e0573df82 Mon Sep 17 00:00:00 2001 From: wirawan Date: Sat, 6 Feb 2010 23:21:09 +0000 Subject: [PATCH] * Added "db" module group to deal with database management tasks. * First module: file_db to store filename, fingerprint (md5sum) and basic stats (mtime, size). Additional fields can be specified as well. * Will use sqlite as the backend for now. * API is rather generic so the underlying database engine can be replaced. --- db/.cvsignore | 3 + db/__init__.py | 11 +++ db/file_db.py | 191 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) create mode 100644 db/.cvsignore create mode 100644 db/__init__.py create mode 100644 db/file_db.py diff --git a/db/.cvsignore b/db/.cvsignore new file mode 100644 index 0000000..c61b055 --- /dev/null +++ b/db/.cvsignore @@ -0,0 +1,3 @@ +*.pyc +*.pyo +*.old* diff --git a/db/__init__.py b/db/__init__.py new file mode 100644 index 0000000..617dbcb --- /dev/null +++ b/db/__init__.py @@ -0,0 +1,11 @@ +# $Id: __init__.py,v 1.1 2010-02-06 23:21:09 wirawan Exp $ +# +# wpylib.db main module +# Created: 20100205 +# Wirawan Purwanto +# +# Database-related stuff. 
# $Id: file_db.py,v 1.1 2010-02-06 23:21:09 wirawan Exp $
#
# wpylib.db.file_db module
# Created: 20100205
# Wirawan Purwanto
#
# NOTE: this module was recovered from a whitespace-collapsed patch.  The
# same collapsed text also carried the tail of db/__init__.py (a module
# docstring "Database-related stuff." followed by `pass`) and the diff
# header announcing db/file_db.py as a new file.

"""File fingerprint database.

Stores, per file name, an MD5 fingerprint plus basic stats (modification
date, modification time and size) in an SQLite table.  Extra user-defined
columns can be declared at construction time or added later.
"""

import hashlib
import os.path
import time

import numpy

try:
    import sqlite3
except ImportError:
    # Legacy fallback for interpreters without the bundled sqlite3 module.
    # The DB-API (connect, etc.) lives in pysqlite2.dbapi2, not in the
    # package root.
    from pysqlite2 import dbapi2 as sqlite3


def _sql_ident(name):
    """Escape a column name for use inside a '...'-quoted SQL template.

    Single quotes are doubled so the name cannot terminate the quoting.
    Percent signs are doubled because every statement is passed through
    %-formatting once more in file_db.exec_sql.
    """
    return name.replace("'", "''").replace("%", "%%")


class file_rec(tuple):
    """Record type for one table row; currently just a plain tuple."""
    pass


class file_db(object):
    """SQLite-backed table of file fingerprints.

    Each row holds: filename, md5sum, date (YYYYMMDD as integer),
    time (HHMMSS as integer), size, followed by any extra columns.
    Changes are only made durable by flush() (or by the methods that
    commit explicitly: the constructor and add_fields()).
    """

    # dtype for numpy (if wanted).
    # NOTE: the fingerprint field is called 'md5' here while the SQL column
    # is called 'md5sum'; kept as is because `dtype` is a public attribute.
    dtype = numpy.dtype([
        ('filename', 'S256'),
        ('md5', 'S32'),
        ('date', 'i4'),
        ('time', 'i4'),
        ('size', 'i8'),
    ])
    # dtype map from python types to sqlite3 types:
    sqlite_dtype_map = {
        str: 'TEXT',
        int: 'INTEGER',
        float: 'REAL',
    }

    def __init__(self, src_name, table_name='filedb', extra_fields=None):
        """Open (or create) the database and make sure the table exists.

        src_name     -- path of the SQLite database file (anything accepted
                        by sqlite3.connect).
        table_name   -- name of the table to use.
        extra_fields -- optional iterable of (column_name, python_type)
                        pairs; python_type must be a key of
                        sqlite_dtype_map (KeyError otherwise).
        """
        self.src_name = src_name
        self.table_name = table_name
        # sqlite3.connect creates the file when it does not exist yet, so
        # no separate "file exists" branch is needed.
        self.db = sqlite3.connect(src_name)
        self.dbc = self.db.cursor()
        self.db.text_factory = str
        self.sql_params = {
            # Substituted as a value into '%(table_name)s', so only the
            # quote character needs escaping here.
            'table_name': table_name.replace("'", "''"),
        }
        self.debug = 1

        columns = [
            "filename TEXT",
            "md5sum TEXT",
            "date INTEGER",
            "time INTEGER",
            "size INTEGER",
        ] + [
            "'%s' %s" % (_sql_ident(dname), self.sqlite_dtype_map[dtyp])
            for (dname, dtyp) in (extra_fields or ())
        ]
        create_sql = (
            "CREATE TABLE IF NOT EXISTS '%(table_name)s' (\n  "
            + ",\n  ".join(columns)
            + "\n);"
        )
        self.exec_sql(create_sql)
        self.db.commit()

    def exec_sql(self, stmt, params=None):
        """Execute one SQL statement on the shared cursor and return it.

        `stmt` is a %-template: '%(table_name)s' is replaced by the table
        name, so a literal percent sign must be written as '%%'.
        `params` are the values bound to the '?' placeholders, if any.
        When self.debug is true the statement (and values) are printed.
        """
        sql_stmt = stmt % self.sql_params
        if self.debug:
            print("--SQL:: " + sql_stmt.rstrip())
            if params:
                print("--val:: %s" % (params,))
        if params:
            return self.dbc.execute(sql_stmt, params)
        return self.dbc.execute(sql_stmt)

    def add_fields(self, dtypes):
        """Adds columns to the table and commits.

        dtypes -- iterable of (column_name, python_type) pairs, same
                  convention as `extra_fields` of the constructor.
        """
        for (dname, dtyp) in dtypes:
            self.exec_sql(
                "ALTER TABLE '%(table_name)s' ADD COLUMN"
                + " '%s' %s;" % (_sql_ident(dname), self.sqlite_dtype_map[dtyp])
            )
        self.db.commit()

    def register_file(self, filename, replace=False, extra_values=None):
        """Register a file: note its mtime and size, and digest its content.

        filename     -- path of the file to fingerprint (it is read in full).
        replace      -- if true and the file name is already in the table,
                        update the existing row(s) instead of inserting.
        extra_values -- optional iterable of (column_name, value) pairs for
                        the extra columns.

        The change is not committed; call flush() to make it durable.
        """
        filestats = get_file_stats(filename)
        fields = [
            ('md5sum', filestats['md5sum']),
            ('date', filestats['mdate']),
            ('time', filestats['mtime']),
            ('size', filestats['size']),
        ]
        if extra_values is not None:
            fields.extend(extra_values)
        dnames = [_sql_ident(dname) for (dname, dval) in fields]
        dvals = [dval for (dname, dval) in fields]

        if replace:
            # Test if we want to replace or to add: with no existing row
            # there is nothing to update, so fall back to a plain insert.
            count = self.exec_sql(
                "SELECT count(*) from '%(table_name)s' where filename = ?;",
                (filename,)
            ).fetchone()[0]
            if count == 0:
                replace = False

        if replace:
            # WARNING: This will replace all the occurrences of the entry
            # with the same filename.
            # Replaceable insert is not intended for tables with duplicate
            # entries of the same filename.
            insert_sql = "UPDATE '%(table_name)s' SET " \
                + ', '.join(["'%s' = ?" % dname for dname in dnames]) \
                + " WHERE filename = ?;"
            vals = tuple(dvals + [filename])
        else:
            insert_sql = "INSERT INTO '%(table_name)s' (filename, " \
                + ", ".join(["'%s'" % dname for dname in dnames]) \
                + ") VALUES (?" + ',?' * len(fields) + ");"
            vals = tuple([filename] + dvals)
        self.exec_sql(insert_sql, vals)

    def flush(self):
        """Commit all pending changes to the database."""
        self.db.commit()

    def get_filenames(self):
        """Reads all the file names in the table to memory (sorted)."""
        return [
            rslt[0] for rslt in
            self.exec_sql("SELECT filename FROM '%(table_name)s' ORDER BY filename;")
        ]

    @staticmethod
    def _filename_op(filename):
        """Pick the comparison: LIKE when the name has a '%' wildcard."""
        if "%" in filename:
            return "LIKE"
        return "="

    def __getitem__(self, filename):
        """Reads all the entries matching in the `filename' field.

        A name containing '%' is treated as an SQL LIKE pattern.
        Returns a list of row tuples (possibly empty).
        """
        sql_stmt = "SELECT * FROM '%(table_name)s' WHERE filename " \
            + self._filename_op(filename) + " ?;"
        return self.exec_sql(sql_stmt, (filename,)).fetchall()

    def __contains__(self, filename):
        """Counts the number of record entries matching in the `filename' field.

        A name containing '%' is treated as an SQL LIKE pattern.  Used via
        the `in` operator the count is reduced to a boolean; call it as
        count() to get the number itself.
        """
        sql_stmt = "SELECT count(*) FROM '%(table_name)s' WHERE filename " \
            + self._filename_op(filename) + " ?;"
        return self.exec_sql(sql_stmt, (filename,)).fetchone()[0]

    count = __contains__


def md5_digest_file(filename):
    """Digests the content of a file; returns the raw 16-byte MD5 digest."""
    bufsize = 32768
    digest = hashlib.md5()
    # `with` guarantees the handle is closed even if a read fails.
    with open(filename, "rb") as ff:
        for stuff in iter(lambda: ff.read(bufsize), b""):
            digest.update(stuff)
    return digest.digest()


def str2hexstr(md5sum):
    """Return the hex representation of a string.

    Accepts a byte string (bytes/bytearray) as well as a text string
    whose characters are taken by their ordinal value.
    """
    if isinstance(md5sum, (bytes, bytearray)):
        # bytearray yields integers on both Python 2 and Python 3.
        return "".join(["%02x" % b for b in bytearray(md5sum)])
    return "".join(["%02x" % ord(c) for c in md5sum])


def get_file_stats(filename):
    """Collect the stats recorded for a file.

    Returns a dict with keys 'filename', 'mdate' (YYYYMMDD integer, local
    time), 'mtime' (HHMMSS integer, local time), 'size' (bytes) and
    'md5sum' (hex string).
    """
    stats = os.stat(filename)
    mtime = time.localtime(stats.st_mtime)
    Mdate = mtime.tm_year * 10000 + mtime.tm_mon * 100 + mtime.tm_mday
    Mtime = mtime.tm_hour * 10000 + mtime.tm_min * 100 + mtime.tm_sec
    size = stats.st_size
    md5sum = str2hexstr(md5_digest_file(filename))  # this step is EXPEN$IVE
    return {
        'filename': filename,
        'mdate': Mdate,
        'mtime': Mtime,
        'size': size,
        'md5sum': md5sum,
    }