First commit
libsvm-3.36/python/libsvm/commonutil.py | 189 lines added (new file)
@@ -0,0 +1,189 @@
from __future__ import print_function

from array import array
import sys

try:
    import numpy as np
    import scipy
    from scipy import sparse
except:
    scipy = None


__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']

def svm_read_problem(data_source, return_scipy=False):
    """
    svm_read_problem(data_source, return_scipy=False) -> [y, x], y: list, x: list of dictionary
    svm_read_problem(data_source, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix

    Read LIBSVM-format data from data_source and return labels y
    and data instances x.
    """
    if scipy != None and return_scipy:
        prob_y = array('d')
        prob_x = array('d')
        row_ptr = array('l', [0])
        col_idx = array('l')
    else:
        prob_y = []
        prob_x = []
        row_ptr = [0]
        col_idx = []
    # LIBSVM feature indices are 1-based by default; switch to 0-based if an index 0 appears
    indx_start = 1

    if hasattr(data_source, "read"):
        file = data_source
    else:
        file = open(data_source)
    try:
        for line in file:
            line = line.split(None, 1)
            # In case an instance with all zero features
            if len(line) == 1: line += ['']
            label, features = line
            prob_y.append(float(label))
            if scipy != None and return_scipy:
                nz = 0
                for e in features.split():
                    ind, val = e.split(":")
                    if ind == '0':
                        indx_start = 0
                    val = float(val)
                    if val != 0:
                        col_idx.append(int(ind)-indx_start)
                        prob_x.append(val)
                        nz += 1
                row_ptr.append(row_ptr[-1]+nz)
            else:
                xi = {}
                for e in features.split():
                    ind, val = e.split(":")
                    xi[int(ind)] = float(val)
                prob_x += [xi]
    except Exception as err_msg:
        raise err_msg
    finally:
        if not hasattr(data_source, "read"):
            # close file only if it was created by us
            file.close()

    if scipy != None and return_scipy:
        # assemble a CSR matrix from the flat value/index/row-pointer buffers
        prob_y = np.frombuffer(prob_y, dtype='d')
        prob_x = np.frombuffer(prob_x, dtype='d')
        col_idx = np.frombuffer(col_idx, dtype='l')
        row_ptr = np.frombuffer(row_ptr, dtype='l')
        prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
    return (prob_y, prob_x)

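# Usage sketch (a minimal example; assumes a LIBSVM-format file named
# "heart_scale" in the working directory, but any LIBSVM-format path or
# file-like object works):
#
#   y, x = svm_read_problem("heart_scale")                    # y: list, x: list of dict
#   y, x = svm_read_problem("heart_scale", return_scipy=True) # y: ndarray, x: csr_matrix
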
def evaluations_scipy(ty, pv):
    """
    evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
    ty, pv: ndarray

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).
    """
    if not (scipy != None and isinstance(ty, np.ndarray) and isinstance(pv, np.ndarray)):
        raise TypeError("type of ty and pv must be ndarray")
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    ACC = 100.0*(ty == pv).mean()
    MSE = ((ty - pv)**2).mean()
    l = len(ty)
    sumv = pv.sum()
    sumy = ty.sum()
    sumvy = (pv*ty).sum()
    sumvv = (pv*pv).sum()
    sumyy = (ty*ty).sum()
    with np.errstate(all = 'raise'):
        try:
            SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
        except:
            SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))

def evaluations(ty, pv, useScipy = True):
    """
    evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
    ty, pv: list, tuple or ndarray
    useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).
    """
    if scipy != None and useScipy:
        return evaluations_scipy(np.asarray(ty), np.asarray(pv))
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v-y)*(v-y)
        sumv += v
        sumy += y
        sumvv += v*v
        sumyy += y*y
        sumvy += v*y
    l = len(ty)
    ACC = 100.0*total_correct/l
    MSE = total_error/l
    try:
        SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
    except:
        SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))

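# Example (a minimal sketch; the label/prediction values below are made up):
#
#   ACC, MSE, SCC = evaluations([1, -1, 1, 1], [1, 1, 1, -1])
#   # ACC = 50.0 (two of four predictions match), MSE = 2.0, SCC is about 0.11.
#
# With useScipy=True (the default) the inputs are converted to ndarrays and
# evaluations_scipy is used; pass useScipy=False to force the pure-Python path.
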
def csr_find_scale_param(x, lower=-1, upper=1):
    assert isinstance(x, sparse.csr_matrix)
    assert lower < upper
    l, n = x.shape
    feat_min = x.min(axis=0).toarray().flatten()
    feat_max = x.max(axis=0).toarray().flatten()
    coef = (feat_max - feat_min) / (upper - lower)
    coef[coef != 0] = 1.0 / coef[coef != 0]

    # (x - ones(l,1) * feat_min') * diag(coef) + lower
    # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
    # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
    # = x * diag(coef) + ones(l, 1) * offset'
    offset = -feat_min * coef + lower
    offset[coef == 0] = 0

    if sum(offset != 0) * l > 3 * x.getnnz():
        print(
            "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
            "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    return {'coef':coef, 'offset':offset}

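# Worked example (illustrative numbers, not from the library): a feature with
# feat_min = 0 and feat_max = 5 scaled to the default range [-1, 1] gives
# coef = 1 / ((5 - 0) / (1 - (-1))) = 0.4 and offset = -0 * 0.4 + (-1) = -1,
# so a raw value of 0 maps to -1 and a raw value of 5 maps to 1.
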
def csr_scale(x, scale_param):
    assert isinstance(x, sparse.csr_matrix)

    offset = scale_param['offset']
    coef = scale_param['coef']
    assert len(coef) == len(offset)

    l, n = x.shape

    if not n == len(coef):
        print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
        # ndarray.resize works in place and returns None, so resize copies
        # instead of assigning the return value (zeros padded if n > len(coef))
        coef = np.copy(coef)
        coef.resize(n, refcheck=False)
        offset = np.copy(offset)
        offset.resize(n, refcheck=False)

    # scaled_x = x * diag(coef) + ones(l, 1) * offset'
    offset = sparse.csr_matrix(offset.reshape(1, n))
    offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
    scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset

    if scaled_x.getnnz() > x.getnnz():
        print(
            "WARNING: original #nonzeros %d\n" % x.getnnz() +
            " > new #nonzeros %d\n" % scaled_x.getnnz() +
            "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    return scaled_x
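
# End-to-end usage sketch (assumes LIBSVM-format files "train.txt" and "test.txt";
# the file names are placeholders). Scaling parameters are computed on the training
# set only and then applied to both sets:
#
#   y_train, x_train = svm_read_problem("train.txt", return_scipy=True)
#   y_test, x_test = svm_read_problem("test.txt", return_scipy=True)
#   param = csr_find_scale_param(x_train, lower=0)
#   x_train_scaled = csr_scale(x_train, param)
#   x_test_scaled = csr_scale(x_test, param)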