First commit
libsvm-3.36/python/libsvm/commonutil.py | 189 lines added (new file)
@@ -0,0 +1,189 @@
from __future__ import print_function

from array import array
import sys

try:
    import numpy as np
    import scipy
    from scipy import sparse
except:
    scipy = None


__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']

def svm_read_problem(data_source, return_scipy=False):
    """
    svm_read_problem(data_source, return_scipy=False) -> [y, x], y: list, x: list of dictionary
    svm_read_problem(data_source, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix

    Read LIBSVM-format data from data_source and return labels y
    and data instances x.
    """
    if scipy != None and return_scipy:
        prob_y = array('d')
        prob_x = array('d')
        row_ptr = array('l', [0])
        col_idx = array('l')
    else:
        prob_y = []
        prob_x = []
        row_ptr = [0]
        col_idx = []
    # LIBSVM feature indices are 1-based by default; switch to 0-based if an index 0 appears
    indx_start = 1

    if hasattr(data_source, "read"):
        file = data_source
    else:
        file = open(data_source)
    try:
        for line in file:
            line = line.split(None, 1)
            # In case an instance with all zero features
            if len(line) == 1: line += ['']
            label, features = line
            prob_y.append(float(label))
            if scipy != None and return_scipy:
                nz = 0
                for e in features.split():
                    ind, val = e.split(":")
                    if ind == '0':
                        indx_start = 0
                    val = float(val)
                    if val != 0:
                        col_idx.append(int(ind)-indx_start)
                        prob_x.append(val)
                        nz += 1
                row_ptr.append(row_ptr[-1]+nz)
            else:
                xi = {}
                for e in features.split():
                    ind, val = e.split(":")
                    xi[int(ind)] = float(val)
                prob_x += [xi]
    except Exception as err_msg:
        raise err_msg
    finally:
        if not hasattr(data_source, "read"):
            # close file only if it was created by us
            file.close()

    if scipy != None and return_scipy:
        # assemble a CSR matrix from the flat value/index/row-pointer buffers
        prob_y = np.frombuffer(prob_y, dtype='d')
        prob_x = np.frombuffer(prob_x, dtype='d')
        col_idx = np.frombuffer(col_idx, dtype='l')
        row_ptr = np.frombuffer(row_ptr, dtype='l')
        prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
    return (prob_y, prob_x)

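# Usage sketch (a minimal example; assumes a LIBSVM-format file named
# "heart_scale" in the working directory, but any LIBSVM-format path or
# file-like object works):
#
#   y, x = svm_read_problem("heart_scale")                    # y: list, x: list of dict
#   y, x = svm_read_problem("heart_scale", return_scipy=True) # y: ndarray, x: csr_matrix
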
def evaluations_scipy(ty, pv):
    """
    evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
    ty, pv: ndarray

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).
    """
    if not (scipy != None and isinstance(ty, np.ndarray) and isinstance(pv, np.ndarray)):
        raise TypeError("type of ty and pv must be ndarray")
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    ACC = 100.0*(ty == pv).mean()
    MSE = ((ty - pv)**2).mean()
    l = len(ty)
    sumv = pv.sum()
    sumy = ty.sum()
    sumvy = (pv*ty).sum()
    sumvv = (pv*pv).sum()
    sumyy = (ty*ty).sum()
    with np.errstate(all = 'raise'):
        try:
            SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
        except:
            SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))

def evaluations(ty, pv, useScipy = True):
    """
    evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
    ty, pv: list, tuple or ndarray
    useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).
    """
    if scipy != None and useScipy:
        return evaluations_scipy(np.asarray(ty), np.asarray(pv))
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v-y)*(v-y)
        sumv += v
        sumy += y
        sumvv += v*v
        sumyy += y*y
        sumvy += v*y
    l = len(ty)
    ACC = 100.0*total_correct/l
    MSE = total_error/l
    try:
        SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
    except:
        SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))

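# Example (a minimal sketch; the label/prediction values below are made up):
#
#   ACC, MSE, SCC = evaluations([1, -1, 1, 1], [1, 1, 1, -1])
#   # ACC = 50.0 (two of four predictions match), MSE = 2.0, SCC is about 0.11.
#
# With useScipy=True (the default) the inputs are converted to ndarrays and
# evaluations_scipy is used; pass useScipy=False to force the pure-Python path.
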
def csr_find_scale_param(x, lower=-1, upper=1):
    assert isinstance(x, sparse.csr_matrix)
    assert lower < upper
    l, n = x.shape
    feat_min = x.min(axis=0).toarray().flatten()
    feat_max = x.max(axis=0).toarray().flatten()
    coef = (feat_max - feat_min) / (upper - lower)
    coef[coef != 0] = 1.0 / coef[coef != 0]

    # (x - ones(l,1) * feat_min') * diag(coef) + lower
    # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
    # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
    # = x * diag(coef) + ones(l, 1) * offset'
    offset = -feat_min * coef + lower
    offset[coef == 0] = 0

    if sum(offset != 0) * l > 3 * x.getnnz():
        print(
            "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
            "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    return {'coef':coef, 'offset':offset}

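# Worked example (illustrative numbers, not from the library): a feature with
# feat_min = 0 and feat_max = 5 scaled to the default range [-1, 1] gives
# coef = 1 / ((5 - 0) / (1 - (-1))) = 0.4 and offset = -0 * 0.4 + (-1) = -1,
# so a raw value of 0 maps to -1 and a raw value of 5 maps to 1.
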
def csr_scale(x, scale_param):
    assert isinstance(x, sparse.csr_matrix)

    offset = scale_param['offset']
    coef = scale_param['coef']
    assert len(coef) == len(offset)

    l, n = x.shape

    if not n == len(coef):
        print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
        # ndarray.resize works in place and returns None, so resize copies
        # instead of assigning the return value (zeros padded if n > len(coef))
        coef = np.copy(coef)
        coef.resize(n, refcheck=False)
        offset = np.copy(offset)
        offset.resize(n, refcheck=False)

    # scaled_x = x * diag(coef) + ones(l, 1) * offset'
    offset = sparse.csr_matrix(offset.reshape(1, n))
    offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
    scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset

    if scaled_x.getnnz() > x.getnnz():
        print(
            "WARNING: original #nonzeros %d\n" % x.getnnz() +
            " > new #nonzeros %d\n" % scaled_x.getnnz() +
            "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    return scaled_x
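
# End-to-end usage sketch (assumes LIBSVM-format files "train.txt" and "test.txt";
# the file names are placeholders). Scaling parameters are computed on the training
# set only and then applied to both sets:
#
#   y_train, x_train = svm_read_problem("train.txt", return_scipy=True)
#   y_test, x_test = svm_read_problem("test.txt", return_scipy=True)
#   param = csr_find_scale_param(x_train, lower=0)
#   x_train_scaled = csr_scale(x_train, param)
#   x_test_scaled = csr_scale(x_test, param)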