* text_input.read_items(): added options `end_line_match' and `last_line_match'

(mutually exclusive options) to allow the dataset reading to end upon
  encountering a certain text pattern (or a more complicated match, if we
  specify a function for the option value).
  These options can be used together with the `maxcount' option; whichever
  condition is met first (maxcount records read, or end/last_line_match finds
  a match) will end the reading of the dataset.
master
Wirawan Purwanto 13 years ago
parent 501552a65a
commit ad841e0b90
  1. 44
      iofmt/text_input.py

@ -27,10 +27,25 @@ This module is part of wpylib project.
import re
import numpy
from wpylib.sugar import zip_gen
from wpylib.file.file_utils import open_input_file
from wpylib.py import make_unbound_instance_method
import wpylib.py.im_weakref
def make_match_proc(match):
    """Build a line-matching procedure from a flexible `match` specification.

    Arguments:
    * match: one of the following:
      - a string: it is compiled as a regular expression, and the
        resulting pattern's `search` method is returned;
      - an object with a callable `search` attribute (e.g. a compiled
        regular expression): its `search` method is returned;
      - anything else (typically a callable taking the text line as its
        single argument): returned as is, without validation.

    Returns a callable that accepts one argument (the text line) and
    returns a true value when the line matches. For the regexp-based
    cases the return value is whatever `search` returns (a match object,
    or None when there is no match).
    """
    # `basestring` exists only in Python 2; fall back to the Python 3
    # string types so that this function works on both interpreters.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)

    if isinstance(match, string_types):
        return re.compile(match).search

    # Duck-type a compiled regexp: anything exposing a callable `search`.
    search = getattr(match, "search", None)
    if callable(search):
        return search

    # Other objects (user-supplied test functions) are passed through.
    return match
class text_input(object):
'''Text input reader with support for UNIX-style comment marker (#) and
standard field separation (tabs and whitespaces).
@ -167,6 +182,7 @@ class text_input(object):
If the tuple contains the third field, it is used as the name of the field;
otherwise the fields are named f0, f1, f2, ....
Preliminary ability to read in complex data has been added!
Complex data (floating-point only) must be specified as a tuple of two columns
containing the real and imaginary data, like this:
((2, 3), complex, 'ampl')
@ -177,8 +193,13 @@ class text_input(object):
Additional keyword options:
* deftype: default datatype
* maxcount: maximum number of records to be read
* end_line_match: a regular expression or test subroutine accepting a
single argument (i.e. the text line) marking the end boundary of the list
to be read (i.e. one line past the list contents)
* last_line_match: a regular expression or test subroutine accepting a
single argument (i.e. the text line) marking the last element of the list
to be read
TODO: Needs ability to read in complex data.
"""
deftype = kwd.get("deftype", float)
@ -226,7 +247,28 @@ class text_input(object):
cols = reg.cols
flds = reg.flds
get_fields = lambda vals : tuple([ filt(vals,col) for (filt,col) in cols ])
if "maxcount" in kwd:
src_iter = zip_gen(xrange(kwd['maxcount']),self)
else:
src_iter = enumerate(self)
# FIXME below: zip() evaluates the function before the loop, thus may
# eat a lot of memory.
if 'end_line_match' in kwd:
rslt = []
match = make_match_proc(kwd['end_line_match'])
for (c,vals) in src_iter:
if match(vals):
break
rslt.append(get_fields(vals.split()))
elif 'last_line_match' in kwd:
rslt = []
match = make_match_proc(kwd['end_line_match'])
for (c,vals) in src_iter:
rslt.append(get_fields(vals.split()))
if match(vals):
break
elif "maxcount" in kwd:
#print "hello"
rslt = [ get_fields(vals.split()) for (c,vals) in zip(xrange(kwd['maxcount']),self) ]
else:

Loading…
Cancel
Save