* First module: file_db to store filename, fingerprint (md5sum) and basic stats (mtime, size). Additional fields can be specified as well. * Will use sqlite as the backend for now. * API is rather generic so the underlying database engine can be replaced.
master
parent
9de91dcc59
commit
669e48b8d3
3 changed files with 205 additions and 0 deletions
@ -0,0 +1,3 @@ |
|||||||
|
*.pyc |
||||||
|
*.pyo |
||||||
|
*.old* |
@ -0,0 +1,11 @@ |
|||||||
|
# $Id: __init__.py,v 1.1 2010-02-06 23:21:09 wirawan Exp $ |
||||||
|
# |
||||||
|
# wpylib.db main module |
||||||
|
# Created: 20100205 |
||||||
|
# Wirawan Purwanto |
||||||
|
# |
||||||
|
# Database-related stuff. |
||||||
|
|
||||||
|
"""Database-related stuff.""" |
||||||
|
|
||||||
|
pass |
@ -0,0 +1,191 @@ |
|||||||
|
# $Id: file_db.py,v 1.1 2010-02-06 23:21:09 wirawan Exp $ |
||||||
|
# |
||||||
|
# wpylib.db.filedb module |
||||||
|
# Created: 20100205 |
||||||
|
# Wirawan Purwanto |
||||||
|
# |
||||||
|
|
||||||
|
"""File fingerprint database.""" |
||||||
|
|
||||||
|
import md5 |
||||||
|
import numpy |
||||||
|
import os.path |
||||||
|
import time |
||||||
|
|
||||||
|
# Prefer the sqlite3 module bundled with Python; fall back to the standalone
# pysqlite package when it is missing.
try:
    import sqlite3
except ImportError:
    # The DB-API module of pysqlite lives in pysqlite2.dbapi2; the bare
    # `pysqlite2` package does not provide connect().
    from pysqlite2 import dbapi2 as sqlite3
||||||
|
|
||||||
|
class file_rec(tuple):
    """Tuple subclass for a file record; adds no behavior of its own."""
||||||
|
|
||||||
|
class file_db(object):
    """File fingerprint database kept in a single sqlite table.

    Every row stores a file name, the md5 digest of its content, and basic
    stats (modification date, modification time, size).  Extra columns can
    be declared when the object is created (`extra_fields`) or later through
    add_fields().

    Changes are not committed by register_file(); call flush() to commit.
    """

    # dtype for numpy (if wanted)
    # NOTE(review): the digest field is named 'md5' here while the table
    # column is 'md5sum' -- confirm which name users of this dtype expect.
    dtype = numpy.dtype([
        ('filename', 'S256'),
        ('md5', 'S32'),
        ('date', 'i4'),
        ('time', 'i4'),
        ('size', 'i8'),
    ])

    # dtype map from python types to sqlite3 types:
    sqlite_dtype_map = {
        str: 'TEXT',
        int: 'INTEGER',
        float: 'REAL',
    }

    def __init__(self, src_name, table_name='filedb', extra_fields=()):
        """Connects to the database `src_name` and creates the table if needed.

        src_name     -- database name handed to sqlite3.connect().
        table_name   -- name of the table holding the file records.
        extra_fields -- iterable of (column_name, python_type) pairs appended
                        to the standard columns; python_type must be a key of
                        sqlite_dtype_map (KeyError otherwise).
        """
        self.src_name = src_name
        self.table_name = table_name
        # The same connect call is used whether or not src_name exists yet.
        self.db = sqlite3.connect(src_name)
        self.dbc = self.db.cursor()
        self.db.text_factory = str
        # Values substituted into every SQL statement by exec_sql().
        self.sql_params = {
            'table_name': table_name,
        }
        self.debug = 1

        extra_columns = "".join([
            ", '%s' %s" % (dname, self.sqlite_dtype_map[dtyp])
            for (dname, dtyp) in (extra_fields or ())
        ])
        create_sql = """\
        CREATE TABLE IF NOT EXISTS '%(table_name)s' (
        filename TEXT,
        md5sum TEXT,
        date INTEGER,
        time INTEGER,
        size INTEGER""" \
            + extra_columns \
            + """
        );
        """
        self.exec_sql(create_sql)
        self.db.commit()

    def exec_sql(self, stmt, params=None):
        """Executes one SQL statement on the shared cursor and returns it.

        `stmt` is first %-formatted with self.sql_params (so it may contain
        `%(table_name)s`); `params`, when non-empty, supplies the values of
        the `?` placeholders.  When self.debug is true the statement and the
        values are printed before execution.
        """
        sql_stmt = stmt % self.sql_params
        if self.debug:
            print("--SQL:: %s" % (sql_stmt.rstrip(),))
            if params:
                print("--val:: %s" % (params,))
        if params:
            return self.dbc.execute(sql_stmt, params)
        return self.dbc.execute(sql_stmt)

    def add_fields(self, dtypes):
        """Adds columns to the table.

        dtypes -- iterable of (column_name, python_type) pairs; python_type
                  must be a key of sqlite_dtype_map.
        """
        for (dname, dtyp) in dtypes:
            # The column clause is formatted here; the table name is filled
            # in afterwards by exec_sql().
            self.exec_sql(
                "ALTER TABLE '%(table_name)s' ADD COLUMN"
                + " '%s' %s;" % (dname, self.sqlite_dtype_map[dtyp])
            )
        self.db.commit()

    def register_file(self, filename, replace=False, extra_values=None):
        """Register a file, note its mtime, and size, and digests its content.

        filename     -- path of the file to record.
        replace      -- when true and the file name is already recorded,
                        update the existing row(s) instead of inserting.
        extra_values -- optional iterable of (column_name, value) pairs for
                        the extra columns; None means no extra values.
        """
        filestats = get_file_stats(filename)
        fields = [
            ('md5sum', filestats['md5sum']),
            ('date', filestats['mdate']),
            ('time', filestats['mtime']),
            ('size', filestats['size']),
        ]
        if extra_values is not None:
            fields.extend(extra_values)
        dnames = [dname for (dname, dval) in fields]
        dvals = [dval for (dname, dval) in fields]

        if replace:
            # Test if we want to replace or to add.
            count = self.exec_sql(
                "SELECT count(*) from '%(table_name)s' where filename = ?;",
                (filename,)
            ).fetchone()[0]
            if count == 0:
                replace = False

        if replace:
            # WARNING: This will replace all the occurrences of the entry with
            # the same filename.
            # Replaceable insert is not intended for tables with duplicate
            # entries of the same filename.
            insert_sql = "UPDATE '%(table_name)s' SET " \
                + ', '.join(["'%s' = ?" % dname for dname in dnames]) \
                + " WHERE filename = ?;"
            vals = tuple(dvals + [filename])
        else:
            insert_sql = "INSERT INTO '%(table_name)s' (filename, " \
                + ", ".join(["'%s'" % dname for dname in dnames]) \
                + ") VALUES (?" + ',?' * (len(fields)) + ");"
            vals = tuple([filename] + dvals)
        self.exec_sql(insert_sql, vals)

    def flush(self):
        """Commits the pending changes to the database."""
        self.db.commit()

    def get_filenames(self):
        """Reads all the file names in the table to memory."""
        return [
            rslt[0] for rslt in
            self.exec_sql("SELECT filename FROM '%(table_name)s' ORDER BY filename;")
        ]

    def __getitem__(self, filename):
        """Reads all the entries matching in the `filename' field.

        A `filename` containing "%" is matched with SQL LIKE, otherwise with
        exact equality.  Returns a list of row tuples.
        """
        if "%" in filename:
            sql_stmt = "SELECT * FROM '%(table_name)s' WHERE filename LIKE ?;"
        else:
            sql_stmt = "SELECT * FROM '%(table_name)s' WHERE filename = ?;"
        return list(self.exec_sql(sql_stmt, (filename,)))

    def count(self, filename):
        """Counts the number of record entries matching in the `filename' field.

        A `filename` containing "%" is matched with SQL LIKE, otherwise with
        exact equality.
        """
        if "%" in filename:
            sql_stmt = "SELECT count(*) FROM '%(table_name)s' WHERE filename LIKE ?;"
        else:
            sql_stmt = "SELECT count(*) FROM '%(table_name)s' WHERE filename = ?;"
        return self.exec_sql(sql_stmt, (filename,)).fetchone()[0]

    # `name in db` uses the count; a non-zero count is truthy.
    __contains__ = count
||||||
|
|
||||||
|
|
||||||
|
def md5_digest_file(filename, bufsize=32768):
    """Digests the content of a file.

    filename -- path of the file, opened in binary mode.
    bufsize  -- number of bytes read per chunk.

    Returns the raw (binary) md5 digest; pass it through a hex encoder to
    get a printable string.
    """
    # hashlib replaces the deprecated md5 module; keep the old module as a
    # fallback for interpreters that lack hashlib.
    try:
        from hashlib import md5 as new_md5
    except ImportError:
        from md5 import new as new_md5

    digest = new_md5()
    ff = open(filename, "rb")
    try:
        # Read in chunks so large files are never held in memory at once.
        stuff = ff.read(bufsize)
        while len(stuff) > 0:
            digest.update(stuff)
            stuff = ff.read(bufsize)
    finally:
        # Close the file even when a read fails.
        ff.close()
    return digest.digest()
||||||
|
|
||||||
|
|
||||||
|
def str2hexstr(md5sum):
    """Return the hex representation of a string.

    Accepts a byte string (such as a raw digest) or a text string; each
    element is rendered as at least two lowercase hex digits.
    """
    if isinstance(md5sum, (bytes, bytearray)):
        # bytearray yields integer values for byte strings on any
        # interpreter, so ord() is not needed.
        codes = bytearray(md5sum)
    else:
        codes = [ord(c) for c in md5sum]
    return "".join(["%02x" % code for code in codes])
||||||
|
|
||||||
|
|
||||||
|
def get_file_stats(filename):
    """Collects the modification stamp, size and md5 digest of a file.

    Returns a dict with keys 'filename', 'mdate' (local modification date
    encoded as the integer YYYYMMDD), 'mtime' (local modification time
    encoded as the integer HHMMSS), 'size' (st_size) and 'md5sum' (hex
    digest of the content).
    """
    info = os.stat(filename)
    stamp = time.localtime(info.st_mtime)
    date_code = stamp.tm_mday + 100 * stamp.tm_mon + 10000 * stamp.tm_year
    time_code = stamp.tm_sec + 100 * stamp.tm_min + 10000 * stamp.tm_hour
    nbytes = info.st_size
    # this step is EXPEN$IVE: the whole file is read to compute the digest
    raw_digest = md5_digest_file(filename)
    hex_digest = str2hexstr(raw_digest)
    return {
        'filename': filename,
        'mdate': date_code,
        'mtime': time_code,
        'size': nbytes,
        'md5sum': hex_digest,
    }
||||||
|
|
Loading…
Reference in new issue