indexing, allowing tolerances to account for the imprecise nature of floating-point numbers. Initial implementation, rather complicated. A simpler rounding-based implementation can be put in later. Includes an initial test. master
parent
4fb16c56b2
commit
099f3e7e06
2 changed files with 225 additions and 0 deletions
@ -0,0 +1,166 @@ |
|||||||
|
# |
||||||
|
# wpylib.db.indexing_float |
||||||
|
# Utilities for indexing based on floating-point values |
||||||
|
# |
||||||
|
# Wirawan Purwanto |
||||||
|
# Created: 20130301 |
||||||
|
# |
||||||
|
|
||||||
|
"""\ |
||||||
|
wpylib.db.indexing_float |
||||||
|
Utilities for indexing based on floating-point values |
||||||
|
""" |
||||||
|
|
||||||
|
import numpy |
||||||
|
import sys |
||||||
|
|
||||||
|
|
||||||
|
def _debug_gen_float_indices1(localvars, debug): |
||||||
|
from wpylib.params.params_flat import Parameters as params |
||||||
|
L = params(localvars) |
||||||
|
if debug > 50: |
||||||
|
print "a_sorted = ", L.a_sorted[1:] |
||||||
|
print "a_diff = ", L.a_diff |
||||||
|
print "a_avg_abs = ", L.a_avg_abs |
||||||
|
print "a_rdiff = ", L.a_rdiff |
||||||
|
print |
||||||
|
#print "rdiff_idx_sorted = ", L.rdiff_idx_sorted # numpy.array(L.rdiff_idx_sorted, dtype=float) |
||||||
|
print "rdiff_idx_sorted = ", " ".join([ "%11d" % i for i in L.rdiff_idx_sorted ]) |
||||||
|
print "too_close = ", " ".join([ "%11d" % int(i) for i in (L.a_rdiff[L.rdiff_idx_sorted] < L.rdiff_threshold) ]) |
||||||
|
print "a_rdiff(sort) = ", L.a_rdiff[L.rdiff_idx_sorted] |
||||||
|
print "a(sort) = ", L.a_sorted[1:][L.rdiff_idx_sorted] |
||||||
|
print |
||||||
|
|
||||||
|
def _debug_gen_float_indices2(localvars, debug): |
||||||
|
from wpylib.params.params_flat import Parameters as params |
||||||
|
L = params(localvars) |
||||||
|
if debug > 50: |
||||||
|
print |
||||||
|
print "a_rdiff aft = ", L.a_rdiff |
||||||
|
print "num unique vals = ", L.n_all_unique_vals |
||||||
|
print "num already uniq = ", len(L.a_already_unique) |
||||||
|
print "unique_vals = ", L.unique_vals[0:L.n_all_unique_vals] |
||||||
|
print "unique_vals(sort)= ", numpy.sort(L.unique_vals[0:L.n_all_unique_vals]) |
||||||
|
|
||||||
|
def _debug_gen_float_indices_found_duplicates(localvars, debug): |
||||||
|
from wpylib.params.params_flat import Parameters as params |
||||||
|
L = params(localvars) |
||||||
|
if debug > 100: |
||||||
|
print "i=", L.i_found, " fused range is ", L.i1, ":", L.i+1 |
||||||
|
print " rdiff", L.orig_rdiff |
||||||
|
print " idx ", L.i1, L.i, ", arr ", L.a_fused_sect |
||||||
|
print " avg ", L.avg |
||||||
|
|
||||||
|
def _debug_gen_float_indices_results(localvars, debug): |
||||||
|
from wpylib.params.params_flat import Parameters as params |
||||||
|
L = params(localvars) |
||||||
|
if debug > 50: |
||||||
|
print |
||||||
|
print "rslt_vals = ", L.rslt_vals |
||||||
|
print "unique_map = ", L.unique_map |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def generate_float_indices(arr, rdiff_threshold, debug=0):
    """Consolidate floating-point values into `unique' values whose relative
    differences are greater than a specified threshold (rdiff_threshold).

    The input is sorted, and every maximal run of neighbouring values whose
    relative difference,

        |a - b| / ((|a| + |b|) / 2),

    does not exceed `rdiff_threshold` is fused into a single value: the
    average of the run.  Two neighbours that are both exactly zero are
    treated as identical (relative difference 0).

    Arguments:
    * arr: a non-empty one-dimensional array, list, or list-like iterable
      supporting len() and indexing.  The working dtype is taken from
      arr[0].
    * rdiff_threshold: largest relative difference at which two
      neighbouring values are still fused.  Zero fuses exact duplicates
      only.
    * debug: verbosity level forwarded to the _debug_gen_float_indices*
      helpers (they print at levels above 50 and above 100).

    Returns a result_base object constructed with two fields:
    * vals: the unique values, sorted in ascending order.
    * index_mapping: dict mapping each original value to the index of its
      unique value in `vals`.

    Raises IndexError if `arr` is empty (arr[0] is read to detect the dtype).
    """
    from wpylib.db.result_base import result_base

    # NOTE: several local names below (a_sorted, a_diff, a_avg_abs, a_rdiff,
    # rdiff_idx_sorted, i_found, i1, i, orig_rdiff, a_fused_sect, avg,
    # a_already_unique, n_all_unique_vals, unique_vals, rslt_vals,
    # unique_map) are read by the debug helpers through locals(); do not
    # rename them without updating those helpers.

    sample = numpy.array([arr[0]])
    # a_sorted[1:] holds the sorted input.  a_sorted[0] is a dummy copy of
    # the smallest value, so that a_rdiff[k] describes the step from
    # a_sorted[k] to a_sorted[k+1] and "belongs" to element a_sorted[k+1].
    a_sorted = numpy.empty(len(arr)+1, dtype=sample.dtype)
    a_sorted[1:] = arr
    a_sorted[1:].sort(kind='heapsort')
    a_sorted[0] = a_sorted[1]  # dummy data
    a_diff = numpy.diff(a_sorted)  # == a_sorted[1:] - a_sorted[:-1]
    a_avg_abs = (numpy.abs(a_sorted[1:]) + numpy.abs(a_sorted[:-1])) * 0.5
    with numpy.errstate(divide='ignore', invalid='ignore'):
        a_rdiff = numpy.abs(a_diff) / a_avg_abs
    # A zero average means both neighbours are exactly zero: the division
    # above gave 0/0 = NaN, which would sort last and never be fused.
    a_rdiff[a_avg_abs == 0] = 0.0
    # The first element must always be present, so mark it as "unique".
    # Infinity (rather than a multiple of the threshold) keeps this valid
    # even when rdiff_threshold is zero.
    a_rdiff[0] = numpy.inf
    # free up the memory:
    if not debug:
        a_diff = None
        a_avg_abs = None

    # Elements whose rdiff <= rdiff_threshold should be consolidated.
    # Visit the candidates in order of increasing rdiff; the stable sort
    # keeps ties in index order.
    rdiff_idx_sorted = numpy.argsort(a_rdiff, kind='mergesort')

    _debug_gen_float_indices1(locals(), debug)

    imax = len(rdiff_idx_sorted)
    # unique_map: mapping from original values to unique indices
    unique_map = {}
    # unique_vals: the fused averages first, followed later by the values
    # that were already distinct.
    unique_vals = numpy.empty((len(arr),), dtype=sample.dtype)  # max len
    n_unique_vals = 0

    for (last_idx, i_found) in enumerate(rdiff_idx_sorted):
        if a_rdiff[i_found] > rdiff_threshold:
            # Stop, all the rest of the values are unique.
            break
        if a_rdiff[i_found] == -1:
            # Already absorbed into a previously fused run.
            continue

        # a_sorted[i_found] and a_sorted[i_found+1] are too close.  There
        # may be more neighbours like that on either side, so grow the run
        # to a_sorted[i1 : i+1].
        #
        # Entries already marked -1 belong to a different, previously fused
        # run and must stop the scan.  Without the `0 <=` test the marker
        # (which is <= any sane threshold) would let this run swallow an
        # adjacent run that happened to be processed earlier.
        i1 = i_found
        while i1 > 0 and 0 <= a_rdiff[i1-1] <= rdiff_threshold:
            i1 -= 1
        i = i_found + 1
        while i < imax and 0 <= a_rdiff[i] <= rdiff_threshold:
            i += 1

        # Mark every element of the run as consumed; a_rdiff[k-1] is the
        # entry that belongs to element a_sorted[k].
        orig_rdiff = a_rdiff[i1-1:i].copy()
        a_rdiff[i1-1:i] = -1

        a_fused_sect = a_sorted[i1:i+1]
        avg = numpy.mean(a_fused_sect)
        unique_vals[n_unique_vals] = avg
        for a in a_fused_sect:
            unique_map[a] = n_unique_vals
        n_unique_vals += 1

        _debug_gen_float_indices_found_duplicates(locals(), debug)

    # unique_vals now contains the fused values.
    # - Copy over the remaining elements, which are already unique
    # - Also, complete the value-to-index lookup
    a_already_unique = [a_sorted[k+1]
                        for k in rdiff_idx_sorted[last_idx:]
                        if a_rdiff[k] != -1]
    n_all_unique_vals = n_unique_vals + len(a_already_unique)
    unique_vals[n_unique_vals:n_all_unique_vals] = a_already_unique
    _debug_gen_float_indices2(locals(), debug)

    for (dn, a) in enumerate(a_already_unique):
        unique_map[a] = n_unique_vals + dn

    # Sort the unique values; rslt_sort_ridx maps a position in unique_vals
    # to its rank in the sorted result.
    rslt_sort_idx = unique_vals[:n_all_unique_vals].argsort(kind='heapsort')
    rslt_sort_ridx = dict((b, a) for (a, b) in enumerate(rslt_sort_idx))

    # Update the value-to-index lookup so it refers to the sorted order.
    for a in list(unique_map):
        unique_map[a] = rslt_sort_ridx[unique_map[a]]
    rslt_vals = unique_vals[rslt_sort_idx]

    _debug_gen_float_indices_results(locals(), debug)

    return result_base(
        # unique values, sorted in ascending order:
        vals=rslt_vals,
        # mapping from each original (less-unique) value to the index of
        # its unique-ized value in `vals`:
        index_mapping=unique_map,
    )
||||||
|
|
@ -0,0 +1,59 @@ |
|||||||
|
from numpy import array, concatenate |
||||||
|
from wpylib.db.indexing_float import generate_float_indices |
||||||
|
|
||||||
|
indices1 = array([ 0.80038202, 0.28583295, 0.13505145, 0.79425102, 0.52347217, 0.47955401, 0.07961833, 0.1024241 , 0.26336713, 0.15990201, 0.81311686, 0.98632763, 0.08275991, |
||||||
|
0.56862337, 0.5679713 , 0.04377884, 0.93023717, 0.60270102, 0.24538933, 0.63922544]) |
||||||
|
indices2 = array([ 0.69053462, 0.09864655, 0.86209023, 0.26140917, 0.8086512 , 0.13796145, 0.1770305 , 0.05061917, 0.81191537, 0.72801096, 0.01129504, 0.13962617, 0.56217892, |
||||||
|
0.94299591, 0.99302594, 0.01167897, 0.54827444, 0.20160252, 0.86603525, 0.20260494]) |
||||||
|
|
||||||
|
|
||||||
|
def Test_1():
    """Run generate_float_indices on the ten largest sample keys.

    The keys are the ten largest values of indices1 and indices2 combined,
    consolidated with a relative-difference threshold of 1e-2 and full
    debug output (debug=101).  The result is returned for inspection;
    nothing is asserted here.
    """
    # concatenate() returns a fresh array, so it can be sorted in place.
    # (The bare name `numpy` is not imported in this file; only `array`
    # and `concatenate` are.)
    keys1 = concatenate((indices1, indices2))
    keys1.sort()
    keys1_test10 = keys1[-10:]

    ans = generate_float_indices(keys1_test10, 1e-2, debug=101)
    # Expected result as recorded by the original author (keys shown
    # rounded to the 8 digits used in the literals above):
    #
    #   vals: array([ 0.80038202, 0.81122781, 0.86406274,
    #                 0.93023717, 0.94299591, 0.98967679])
    #   index_mapping:
    #     {0.80038202: 0,
    #      0.8086512:  1,
    #      0.81191537: 1,
    #      0.81311686: 1,
    #      0.86209023: 2,
    #      0.86603525: 2,
    #      0.93023717: 3,
    #      0.94299591: 4,
    #      0.98632763: 5,
    #      0.99302594: 5}
    return ans
||||||
|
|
||||||
|
|
||||||
|
def Test_1b():
    """Like Test_1, with one extra isolated key (1.03) appended.

    The keys are the ten largest values of indices1 and indices2 combined
    plus 1.03, consolidated with a relative-difference threshold of 1e-2
    and full debug output (debug=101).  The result is returned for
    inspection; nothing is asserted here.
    """
    # concatenate() returns a fresh array, so it can be sorted in place.
    # (The bare name `numpy` is not imported in this file; only `array`
    # and `concatenate` are.)
    keys1 = concatenate((indices1, indices2))
    keys1.sort()
    keys1_test10 = concatenate((keys1[-10:], [1.03]))

    ans = generate_float_indices(keys1_test10, 1e-2, debug=101)
    # Expected result as recorded by the original author:
    #
    #   vals: array([ 0.80038202, 0.81122781, 0.86406274, 0.93023717,
    #                 0.94299591, 0.98967679, 1.03 ])
    #   index_mapping:
    #     {0.80038202: 0,
    #      0.8086512:  1,
    #      0.81191537: 1,
    #      0.81311686: 1,
    #      0.86209023: 2,
    #      0.86603525: 2,
    #      0.93023717: 3,
    #      0.94299591: 4,
    #      0.98632763: 5,
    #      0.99302594: 5,
    #      1.03:       6}
    return ans
||||||
|
|
Loading…
Reference in new issue