From 2ab828b400555b8207222675bd3b2a9768e0e96d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 14 Feb 2023 23:02:22 +0100 Subject: [PATCH 1/9] Add glass and test to sample --- sample/sample.cpp | 6 +- tests/datasets/glass.arff | 332 ++++++++++++++++++++++++++++++++++++++ tests/datasets/test.arff | 180 +++++++++++++++++++++ 3 files changed, 516 insertions(+), 2 deletions(-) create mode 100755 tests/datasets/glass.arff create mode 100755 tests/datasets/test.arff diff --git a/sample/sample.cpp b/sample/sample.cpp index db784c8..18efdb9 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -16,10 +16,12 @@ int main(int argc, char** argv) {"mfeat-factors", true}, {"iris", true}, {"letter", true}, - {"kdd_JapaneseVowels", false} + {"glass", true}, + {"kdd_JapaneseVowels", false}, + {"test", true} }; if (argc != 2 || datasets.find(argv[1]) == datasets.end()) { - cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl; + cout << "Usage: " << argv[0] << " {mfeat-factors, glass, iris, letter, kdd_JapaneseVowels, test}" << endl; return 1; } diff --git a/tests/datasets/glass.arff b/tests/datasets/glass.arff new file mode 100755 index 0000000..abd9e3c --- /dev/null +++ b/tests/datasets/glass.arff @@ -0,0 +1,332 @@ +% 1. Title: Glass Identification Database +% +% 2. Sources: +% (a) Creator: B. German +% -- Central Research Establishment +% Home Office Forensic Science Service +% Aldermaston, Reading, Berkshire RG7 4PN +% (b) Donor: Vina Spiehler, Ph.D., DABFT +% Diagnostic Products Corporation +% (213) 776-0180 (ext 3014) +% (c) Date: September, 1987 +% +% 3. Past Usage: +% -- Rule Induction in Forensic Science +% -- Ian W. Evett and Ernest J. Spiehler +% -- Central Research Establishment +% Home Office Forensic Science Service +% Aldermaston, Reading, Berkshire RG7 4PN +% -- Unknown technical note number (sorry, not listed here) +% -- General Results: nearest neighbor held its own with respect to the +% rule-based system +% +% 4. Relevant Information:n +% Vina conducted a comparison test of her rule-based system, BEAGLE, the +% nearest-neighbor algorithm, and discriminant analysis. BEAGLE is +% a product available through VRS Consulting, Inc.; 4676 Admiralty Way, +% Suite 206; Marina Del Ray, CA 90292 (213) 827-7890 and FAX: -3189. +% In determining whether the glass was a type of "float" glass or not, +% the following results were obtained (# incorrect answers): +% +% Type of Sample Beagle NN DA +% Windows that were float processed (87) 10 12 21 +% Windows that were not: (76) 19 16 22 +% +% The study of classification of types of glass was motivated by +% criminological investigation. At the scene of the crime, the glass left +% can be used as evidence...if it is correctly identified! +% +% 5. Number of Instances: 214 +% +% 6. Number of Attributes: 10 (including an Id#) plus the class attribute +% -- all attributes are continuously valued +% +% 7. Attribute Information: +% 1. Id number: 1 to 214 +% 2. RI: refractive index +% 3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as +% are attributes 4-10) +% 4. Mg: Magnesium +% 5. Al: Aluminum +% 6. Si: Silicon +% 7. K: Potassium +% 8. Ca: Calcium +% 9. Ba: Barium +% 10. Fe: Iron +% 11. Type of glass: (class attribute) +% -- 1 building_windows_float_processed +% -- 2 building_windows_non_float_processed +% -- 3 vehicle_windows_float_processed +% -- 4 vehicle_windows_non_float_processed (none in this database) +% -- 5 containers +% -- 6 tableware +% -- 7 headlamps +% +% 8. Missing Attribute Values: None +% +% Summary Statistics: +% Attribute: Min Max Mean SD Correlation with class +% 2. RI: 1.5112 1.5339 1.5184 0.0030 -0.1642 +% 3. Na: 10.73 17.38 13.4079 0.8166 0.5030 +% 4. Mg: 0 4.49 2.6845 1.4424 -0.7447 +% 5. Al: 0.29 3.5 1.4449 0.4993 0.5988 +% 6. Si: 69.81 75.41 72.6509 0.7745 0.1515 +% 7. K: 0 6.21 0.4971 0.6522 -0.0100 +% 8. Ca: 5.43 16.19 8.9570 1.4232 0.0007 +% 9. Ba: 0 3.15 0.1750 0.4972 0.5751 +% 10. Fe: 0 0.51 0.0570 0.0974 -0.1879 +% +% 9. Class Distribution: (out of 214 total instances) +% -- 163 Window glass (building windows and vehicle windows) +% -- 87 float processed +% -- 70 building windows +% -- 17 vehicle windows +% -- 76 non-float processed +% -- 76 building windows +% -- 0 vehicle windows +% -- 51 Non-window glass +% -- 13 containers +% -- 9 tableware +% -- 29 headlamps +% +% +% +% +% +% +% +% Relabeled values in attribute 'Type' +% From: '1' To: 'build wind float' +% From: '2' To: 'build wind non-float' +% From: '3' To: 'vehic wind float' +% From: '4' To: 'vehic wind non-float' +% From: '5' To: containers +% From: '6' To: tableware +% From: '7' To: headlamps +% +@relation Glass +@attribute 'RI' real +@attribute 'Na' real +@attribute 'Mg' real +@attribute 'Al' real +@attribute 'Si' real +@attribute 'K' real +@attribute 'Ca' real +@attribute 'Ba' real +@attribute 'Fe' real +@attribute 'Type' { 'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps} +@data +1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0,0,'build wind float' +1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0,0,'vehic wind float' +1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0,0,'build wind float' +1.51299,14.4,1.74,1.54,74.55,0,7.59,0,0,tableware +1.53393,12.3,0,1,70.16,0.12,16.19,0,0.24,'build wind non-float' +1.51655,12.75,2.85,1.44,73.27,0.57,8.79,0.11,0.22,'build wind non-float' +1.51779,13.64,3.65,0.65,73,0.06,8.93,0,0,'vehic wind float' +1.51837,13.14,2.84,1.28,72.85,0.55,9.07,0,0,'build wind float' +1.51545,14.14,0,2.68,73.39,0.08,9.07,0.61,0.05,headlamps +1.51789,13.19,3.9,1.3,72.33,0.55,8.44,0,0.28,'build wind non-float' +1.51625,13.36,3.58,1.49,72.72,0.45,8.21,0,0,'build wind non-float' +1.51743,12.2,3.25,1.16,73.55,0.62,8.9,0,0.24,'build wind non-float' +1.52223,13.21,3.77,0.79,71.99,0.13,10.02,0,0,'build wind float' +1.52121,14.03,3.76,0.58,71.79,0.11,9.65,0,0,'vehic wind float' +1.51665,13.14,3.45,1.76,72.48,0.6,8.38,0,0.17,'vehic wind float' +1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0,0,'build wind non-float' +1.51719,14.75,0,2,73.02,0,8.53,1.59,0.08,headlamps +1.51629,12.71,3.33,1.49,73.28,0.67,8.24,0,0,'build wind non-float' +1.51994,13.27,0,1.76,73.03,0.47,11.32,0,0,containers +1.51811,12.96,2.96,1.43,72.92,0.6,8.79,0.14,0,'build wind non-float' +1.52152,13.05,3.65,0.87,72.22,0.19,9.85,0,0.17,'build wind float' +1.52475,11.45,0,1.88,72.19,0.81,13.24,0,0.34,'build wind non-float' +1.51841,12.93,3.74,1.11,72.28,0.64,8.96,0,0.22,'build wind non-float' +1.51754,13.39,3.66,1.19,72.79,0.57,8.27,0,0.11,'build wind float' +1.52058,12.85,1.61,2.17,72.18,0.76,9.7,0.24,0.51,containers +1.51569,13.24,3.49,1.47,73.25,0.38,8.03,0,0,'build wind non-float' +1.5159,12.82,3.52,1.9,72.86,0.69,7.97,0,0,'build wind non-float' +1.51683,14.56,0,1.98,73.29,0,8.52,1.57,0.07,headlamps +1.51687,13.23,3.54,1.48,72.84,0.56,8.1,0,0,'build wind non-float' +1.5161,13.33,3.53,1.34,72.67,0.56,8.33,0,0,'vehic wind float' +1.51674,12.87,3.56,1.64,73.14,0.65,7.99,0,0,'build wind non-float' +1.51832,13.33,3.34,1.54,72.14,0.56,8.99,0,0,'vehic wind float' +1.51115,17.38,0,0.34,75.41,0,6.65,0,0,tableware +1.51645,13.44,3.61,1.54,72.39,0.66,8.03,0,0,'build wind non-float' +1.51755,13,3.6,1.36,72.99,0.57,8.4,0,0.11,'build wind float' +1.51571,12.72,3.46,1.56,73.2,0.67,8.09,0,0.24,'build wind float' +1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0,0.26,'build wind float' +1.5173,12.35,2.72,1.63,72.87,0.7,9.23,0,0,'build wind non-float' +1.51662,12.85,3.51,1.44,73.01,0.68,8.23,0.06,0.25,'build wind non-float' +1.51409,14.25,3.09,2.08,72.28,1.1,7.08,0,0,'build wind non-float' +1.51797,12.74,3.48,1.35,72.96,0.64,8.68,0,0,'build wind float' +1.51806,13,3.8,1.08,73.07,0.56,8.38,0,0.12,'build wind non-float' +1.51627,13,3.58,1.54,72.83,0.61,8.04,0,0,'build wind non-float' +1.5159,13.24,3.34,1.47,73.1,0.39,8.22,0,0,'build wind non-float' +1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24,'vehic wind float' +1.51755,12.71,3.42,1.2,73.2,0.59,8.64,0,0,'build wind float' +1.51514,14.01,2.68,3.5,69.89,1.68,5.87,2.2,0,containers +1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0,0,'build wind float' +1.51784,13.08,3.49,1.28,72.86,0.6,8.49,0,0,'build wind float' +1.52177,13.2,3.68,1.15,72.75,0.54,8.52,0,0,'build wind non-float' +1.51753,12.57,3.47,1.38,73.39,0.6,8.55,0,0.06,'build wind float' +1.51851,13.2,3.63,1.07,72.83,0.57,8.41,0.09,0.17,'build wind non-float' +1.51743,13.3,3.6,1.14,73.09,0.58,8.17,0,0,'build wind float' +1.51593,13.09,3.59,1.52,73.1,0.67,7.83,0,0,'build wind non-float' +1.5164,14.37,0,2.74,72.85,0,9.45,0.54,0,headlamps +1.51735,13.02,3.54,1.69,72.73,0.54,8.44,0,0.07,'build wind float' +1.52247,14.86,2.2,2.06,70.26,0.76,9.76,0,0,headlamps +1.52099,13.69,3.59,1.12,71.96,0.09,9.4,0,0,'build wind float' +1.51769,13.65,3.66,1.11,72.77,0.11,8.6,0,0,'vehic wind float' +1.51846,13.41,3.89,1.33,72.38,0.51,8.28,0,0,'build wind non-float' +1.51848,13.64,3.87,1.27,71.96,0.54,8.32,0,0.32,'build wind non-float' +1.51905,13.6,3.62,1.11,72.64,0.14,8.76,0,0,'build wind float' +1.51567,13.29,3.45,1.21,72.74,0.56,8.57,0,0,'build wind float' +1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0,0,'build wind float' +1.5232,13.72,3.72,0.51,71.75,0.09,10.06,0,0.16,'build wind float' +1.51556,13.87,0,2.54,73.23,0.14,9.41,0.81,0.01,headlamps +1.51926,13.2,3.33,1.28,72.36,0.6,9.14,0,0.11,'build wind float' +1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0,0.37,'vehic wind float' +1.53125,10.73,0,2.1,69.81,0.58,13.3,3.15,0.28,'build wind non-float' +1.52152,13.05,3.65,0.87,72.32,0.19,9.85,0,0.17,'build wind float' +1.51829,14.46,2.24,1.62,72.38,0,9.26,0,0,tableware +1.51892,13.46,3.83,1.26,72.55,0.57,8.21,0,0.14,'build wind non-float' +1.51888,14.99,0.78,1.74,72.5,0,9.95,0,0,tableware +1.51829,13.24,3.9,1.41,72.33,0.55,8.31,0,0.1,'build wind non-float' +1.523,13.31,3.58,0.82,71.99,0.12,10.17,0,0.03,'build wind float' +1.51652,13.56,3.57,1.47,72.45,0.64,7.96,0,0,'build wind non-float' +1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0,0,'build wind float' +1.51215,12.99,3.47,1.12,72.98,0.62,8.35,0,0.31,'build wind float' +1.51646,13.04,3.4,1.26,73.01,0.52,8.58,0,0,'vehic wind float' +1.51721,12.87,3.48,1.33,73.04,0.56,8.43,0,0,'build wind float' +1.51763,12.8,3.66,1.27,73.01,0.6,8.56,0,0,'build wind float' +1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0,0,'build wind float' +1.52127,14.32,3.9,0.83,71.5,0,9.49,0,0,'vehic wind float' +1.51779,13.21,3.39,1.33,72.76,0.59,8.59,0,0,'build wind float' +1.52171,11.56,1.88,1.56,72.86,0.47,11.41,0,0,containers +1.518,13.71,3.93,1.54,71.81,0.54,8.21,0,0.15,'build wind non-float' +1.52777,12.64,0,0.67,72.02,0.06,14.4,0,0,'build wind non-float' +1.5175,12.82,3.55,1.49,72.75,0.54,8.52,0,0.19,'build wind float' +1.51764,12.98,3.54,1.21,73,0.65,8.53,0,0,'build wind float' +1.52177,13.75,1.01,1.36,72.19,0.33,11.14,0,0,'build wind non-float' +1.51645,14.94,0,1.87,73.11,0,8.67,1.38,0,headlamps +1.51786,12.73,3.43,1.19,72.95,0.62,8.76,0,0.3,'build wind float' +1.52152,13.12,3.58,0.9,72.2,0.23,9.82,0,0.16,'build wind float' +1.51937,13.79,2.41,1.19,72.76,0,9.77,0,0,tableware +1.51514,14.85,0,2.42,73.72,0,8.39,0.56,0,headlamps +1.52172,13.48,3.74,0.9,72.01,0.18,9.61,0,0.07,'build wind float' +1.51732,14.95,0,1.8,72.99,0,8.61,1.55,0,headlamps +1.5202,13.98,1.35,1.63,71.76,0.39,10.56,0,0.18,'build wind non-float' +1.51605,12.9,3.44,1.45,73.06,0.44,8.27,0,0,'build wind non-float' +1.51847,13.1,3.97,1.19,72.44,0.6,8.43,0,0,'build wind non-float' +1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,'build wind float' +1.51673,13.3,3.64,1.53,72.53,0.65,8.03,0,0.29,'build wind non-float' +1.52365,15.79,1.83,1.31,70.43,0.31,8.61,1.68,0,headlamps +1.51685,14.92,0,1.99,73.06,0,8.4,1.59,0,headlamps +1.51658,14.8,0,1.99,73.11,0,8.28,1.71,0,headlamps +1.51316,13.02,0,3.04,70.48,6.21,6.96,0,0,containers +1.51709,13,3.47,1.79,72.72,0.66,8.18,0,0,'build wind non-float' +1.51727,14.7,0,2.34,73.28,0,8.95,0.66,0,headlamps +1.51898,13.58,3.35,1.23,72.08,0.59,8.91,0,0,'build wind float' +1.51969,12.64,0,1.65,73.75,0.38,11.53,0,0,containers +1.5182,12.62,2.76,0.83,73.81,0.35,9.42,0,0.2,'build wind non-float' +1.51617,14.95,0,2.27,73.3,0,8.71,0.67,0,headlamps +1.51911,13.9,3.73,1.18,72.12,0.06,8.89,0,0,'build wind float' +1.51651,14.38,0,1.94,73.61,0,8.48,1.57,0,headlamps +1.51694,12.86,3.58,1.31,72.61,0.61,8.79,0,0,'vehic wind float' +1.52315,13.44,3.34,1.23,72.38,0.6,8.83,0,0,headlamps +1.52068,13.55,2.09,1.67,72.18,0.53,9.57,0.27,0.17,'build wind non-float' +1.51838,14.32,3.26,2.22,71.25,1.46,5.79,1.63,0,headlamps +1.51818,13.72,0,0.56,74.45,0,10.99,0,0,'build wind non-float' +1.51769,12.45,2.71,1.29,73.7,0.56,9.06,0,0.24,'build wind float' +1.5166,12.99,3.18,1.23,72.97,0.58,8.81,0,0.24,'build wind non-float' +1.51589,12.88,3.43,1.4,73.28,0.69,8.05,0,0.24,'build wind float' +1.5241,13.83,2.9,1.17,71.15,0.08,10.79,0,0,'build wind non-float' +1.52725,13.8,3.15,0.66,70.57,0.08,11.64,0,0,'build wind non-float' +1.52119,12.97,0.33,1.51,73.39,0.13,11.27,0,0.28,containers +1.51748,12.86,3.56,1.27,73.21,0.54,8.38,0,0.17,'build wind float' +1.51653,11.95,0,1.19,75.18,2.7,8.93,0,0,headlamps +1.51623,14.14,0,2.88,72.61,0.08,9.18,1.06,0,headlamps +1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,'build wind float' +1.51763,12.61,3.59,1.31,73.29,0.58,8.5,0,0,'build wind float' +1.51596,13.02,3.56,1.54,73.11,0.72,7.9,0,0,'build wind non-float' +1.51674,12.79,3.52,1.54,73.36,0.66,7.9,0,0,'build wind non-float' +1.52065,14.36,0,2.02,73.42,0,8.44,1.64,0,headlamps +1.51768,12.65,3.56,1.3,73.08,0.61,8.69,0,0.14,'build wind float' +1.52369,13.44,0,1.58,72.22,0.32,12.24,0,0,containers +1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0,0,'build wind float' +1.51754,13.48,3.74,1.17,72.99,0.59,8.03,0,0,'build wind float' +1.51711,12.89,3.62,1.57,72.96,0.61,8.11,0,0,'build wind non-float' +1.5221,13.73,3.84,0.72,71.76,0.17,9.74,0,0,'build wind float' +1.51594,13.09,3.52,1.55,72.87,0.68,8.05,0,0.09,'build wind non-float' +1.51784,12.68,3.67,1.16,73.11,0.61,8.7,0,0,'build wind float' +1.51909,13.89,3.53,1.32,71.81,0.51,8.78,0.11,0,'build wind float' +1.51977,13.81,3.58,1.32,71.72,0.12,8.67,0.69,0,'build wind float' +1.51666,12.86,0,1.83,73.88,0.97,10.17,0,0,containers +1.51631,13.34,3.57,1.57,72.87,0.61,7.89,0,0,'build wind non-float' +1.51872,12.93,3.66,1.56,72.51,0.58,8.55,0,0.12,'build wind non-float' +1.51708,13.72,3.68,1.81,72.06,0.64,7.88,0,0,'build wind non-float' +1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0,0.17,'build wind non-float' +1.51574,14.86,3.67,1.74,71.87,0.16,7.36,0,0.12,'build wind non-float' +1.51813,13.43,3.98,1.18,72.49,0.58,8.15,0,0,'build wind non-float' +1.51131,13.69,3.2,1.81,72.81,1.76,5.43,1.19,0,headlamps +1.52227,14.17,3.81,0.78,71.35,0,9.69,0,0,'build wind float' +1.52614,13.7,0,1.36,71.24,0.19,13.44,0,0.1,'build wind non-float' +1.51811,13.33,3.85,1.25,72.78,0.52,8.12,0,0,'build wind non-float' +1.51655,13.41,3.39,1.28,72.64,0.52,8.65,0,0,'vehic wind float' +1.51751,12.81,3.57,1.35,73.02,0.62,8.59,0,0,'build wind float' +1.51508,15.15,0,2.25,73.5,0,8.34,0.63,0,headlamps +1.51915,12.73,1.85,1.86,72.69,0.6,10.09,0,0,containers +1.51966,14.77,3.75,0.29,72.02,0.03,9,0,0,'build wind float' +1.51844,13.25,3.76,1.32,72.4,0.58,8.42,0,0,'build wind non-float' +1.52664,11.23,0,0.77,73.21,0,14.68,0,0,'build wind non-float' +1.52172,13.51,3.86,0.88,71.79,0.23,9.54,0,0.11,'build wind float' +1.51602,14.85,0,2.38,73.28,0,8.76,0.64,0.09,headlamps +1.51321,13,0,3.02,70.7,6.21,6.93,0,0,containers +1.52739,11.02,0,0.75,73.08,0,14.96,0,0,'build wind non-float' +1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0,0,'build wind float' +1.51747,12.84,3.5,1.14,73.27,0.56,8.55,0,0,'build wind float' +1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0,0.35,'build wind non-float' +1.51646,13.41,3.55,1.25,72.81,0.68,8.1,0,0,'build wind non-float' +1.51609,15.01,0,2.51,73.05,0.05,8.83,0.53,0,headlamps +1.51667,12.94,3.61,1.26,72.75,0.56,8.6,0,0,'build wind non-float' +1.51588,13.12,3.41,1.58,73.26,0.07,8.39,0,0.19,'build wind non-float' +1.52667,13.99,3.7,0.71,71.57,0.02,9.82,0,0.1,'build wind float' +1.51831,14.39,0,1.82,72.86,1.41,6.47,2.88,0,headlamps +1.51918,14.04,3.58,1.37,72.08,0.56,8.3,0,0,'build wind float' +1.51613,13.88,1.78,1.79,73.1,0,8.67,0.76,0,headlamps +1.52196,14.36,3.85,0.89,71.36,0.15,9.15,0,0,'build wind float' +1.51824,12.87,3.48,1.29,72.95,0.6,8.43,0,0,'build wind float' +1.52151,11.03,1.71,1.56,73.44,0.58,11.62,0,0,containers +1.51969,14.56,0,0.56,73.48,0,11.22,0,0,tableware +1.51618,13.01,3.5,1.48,72.89,0.6,8.12,0,0,'build wind non-float' +1.51645,13.4,3.49,1.52,72.65,0.67,8.08,0,0.1,'build wind non-float' +1.51796,13.5,3.36,1.63,71.94,0.57,8.81,0,0.09,'vehic wind float' +1.52222,14.43,0,1,72.67,0.1,11.52,0,0.08,'build wind non-float' +1.51783,12.69,3.54,1.34,72.95,0.57,8.75,0,0,'build wind float' +1.51711,14.23,0,2.08,73.36,0,8.62,1.67,0,headlamps +1.51736,12.78,3.62,1.29,72.79,0.59,8.7,0,0,'build wind float' +1.51808,13.43,2.87,1.19,72.84,0.55,9.03,0,0,'build wind float' +1.5167,13.24,3.57,1.38,72.7,0.56,8.44,0,0.1,'vehic wind float' +1.52043,13.38,0,1.4,72.25,0.33,12.5,0,0,containers +1.519,13.49,3.48,1.35,71.95,0.55,9,0,0,'build wind float' +1.51778,13.21,2.81,1.29,72.98,0.51,9.02,0,0.09,'build wind float' +1.51905,14,2.39,1.56,72.37,0,9.57,0,0,tableware +1.51531,14.38,0,2.66,73.1,0.04,9.08,0.64,0,headlamps +1.51916,14.15,0,2.09,72.74,0,10.88,0,0,tableware +1.51841,13.02,3.62,1.06,72.34,0.64,9.13,0,0.15,'build wind non-float' +1.5159,13.02,3.58,1.51,73.12,0.69,7.96,0,0,'build wind non-float' +1.51593,13.25,3.45,1.43,73.17,0.61,7.86,0,0,'build wind non-float' +1.5164,12.55,3.48,1.87,73.23,0.63,8.08,0,0.09,'build wind non-float' +1.51663,12.93,3.54,1.62,72.96,0.64,8.03,0,0.21,'build wind non-float' +1.5169,13.33,3.54,1.61,72.54,0.68,8.11,0,0,'build wind non-float' +1.51869,13.19,3.37,1.18,72.72,0.57,8.83,0,0.16,'build wind float' +1.51776,13.53,3.41,1.52,72.04,0.58,8.79,0,0,'vehic wind float' +1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,'build wind float' +1.5186,13.36,3.43,1.43,72.26,0.51,8.6,0,0,'build wind non-float' +1.5172,13.38,3.5,1.15,72.85,0.5,8.43,0,0,'build wind float' +1.51623,14.2,0,2.79,73.46,0.04,9.04,0.4,0.09,headlamps +1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0,0,'build wind float' +1.51761,12.81,3.54,1.23,73.24,0.58,8.39,0,0,'build wind float' +1.5161,13.42,3.4,1.22,72.69,0.59,8.32,0,0,'vehic wind float' +1.51592,12.86,3.52,2.12,72.66,0.69,7.97,0,0,'build wind non-float' +1.51613,13.92,3.52,1.25,72.88,0.37,7.94,0,0.14,'build wind non-float' +1.51689,12.67,2.88,1.71,73.21,0.73,8.54,0,0,'build wind non-float' +1.51852,14.09,2.19,1.66,72.67,0,9.32,0,0,tableware diff --git a/tests/datasets/test.arff b/tests/datasets/test.arff new file mode 100755 index 0000000..31d50a1 --- /dev/null +++ b/tests/datasets/test.arff @@ -0,0 +1,180 @@ +% 1. Title: Test Feature extracted from Glass +% + +@RELATION test + +@ATTRIBUTE Mg REAL +@ATTRIBUTE Type {0,1,2,3,4,5,6} + +@DATA +3.5,0 +3.52,1 +1.74,2 +0.0,3 +2.85,3 +3.65,1 +2.84,0 +0.0,4 +3.9,3 +3.58,3 +3.25,3 +3.76,1 +3.45,1 +3.48,3 +0.0,4 +0.0,5 +2.96,3 +3.65,0 +0.0,3 +3.74,3 +3.66,0 +1.61,5 +3.49,3 +3.52,3 +3.54,3 +3.53,1 +3.56,3 +3.34,1 +0.0,2 +3.61,3 +3.6,0 +3.46,0 +2.72,3 +3.51,3 +3.09,3 +3.48,0 +3.8,3 +3.58,3 +3.54,1 +3.42,0 +2.68,5 +3.49,0 +3.68,3 +3.6,0 +3.59,3 +0.0,4 +3.54,0 +2.2,4 +3.59,0 +3.66,1 +3.87,3 +3.45,0 +3.82,0 +3.72,0 +3.33,0 +3.78,1 +2.24,2 +3.83,3 +0.78,2 +3.9,3 +3.58,0 +3.57,3 +3.52,0 +3.47,0 +3.48,0 +3.66,0 +3.62,0 +3.39,0 +0.0,3 +3.55,0 +1.01,3 +0.0,4 +3.43,0 +3.58,0 +0.0,4 +3.74,0 +0.0,4 +3.44,3 +3.97,3 +3.6,0 +3.64,3 +1.83,4 +0.0,4 +0.0,5 +0.0,4 +0.0,5 +0.0,4 +3.73,0 +3.58,1 +3.34,4 +2.09,3 +2.71,0 +3.18,3 +3.43,0 +3.15,3 +3.56,0 +0.0,4 +0.0,4 +4.49,0 +3.59,0 +3.56,3 +3.52,3 +0.0,4 +0.0,5 +3.61,0 +3.74,0 +3.62,3 +3.84,0 +3.67,0 +3.58,0 +0.0,5 +3.66,3 +3.68,3 +2.28,3 +3.67,3 +3.2,4 +3.81,0 +0.0,3 +3.39,1 +3.57,0 +1.85,5 +3.75,0 +3.76,3 +0.0,3 +3.86,0 +0.0,4 +0.0,5 +0.0,3 +3.5,0 +3.67,3 +3.55,3 +0.0,4 +3.61,3 +3.41,3 +3.7,0 +0.0,4 +3.58,0 +1.78,4 +3.85,0 +3.48,0 +1.71,5 +0.0,2 +3.5,3 +3.49,3 +3.36,1 +0.0,3 +3.54,0 +0.0,4 +2.87,0 +3.57,1 +3.48,0 +2.81,0 +0.0,4 +0.0,2 +3.62,3 +3.58,3 +3.45,3 +3.48,3 +3.54,3 +3.54,3 +3.37,0 +3.41,1 +3.48,0 +3.43,3 +3.5,0 +0.0,4 +3.54,0 +3.52,3 +3.52,3 +2.88,3 +2.19,2 From 1c7492d3b637d002afb6c508686ff3bf88d865a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 14 Feb 2023 23:03:01 +0100 Subject: [PATCH 2/9] Add debug config for sample --- .vscode/tasks.json | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .vscode/tasks.json diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..5318667 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,29 @@ +{ + "tasks": [ + { + "type": "cppbuild", + "label": "C/C++: clang++ build active file", + "command": "/usr/bin/clang++", + "args": [ + "-fcolor-diagnostics", + "-fansi-escape-codes", + "-g", + "${file}", + "-o", + "${fileDirname}/${fileBasenameNoExtension}" + ], + "options": { + "cwd": "${fileDirname}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "detail": "Task generated by Debugger." + } + ], + "version": "2.0.0" +} \ No newline at end of file From e37702dcb068ef61ff6d9274eeb2afb3a2d04af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 14 Feb 2023 23:03:23 +0100 Subject: [PATCH 3/9] Fix mistake in final cut value --- CPPFImdlp.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 76ff3ce..2e15907 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -4,6 +4,7 @@ #include #include "CPPFImdlp.h" #include "Metrics.h" +#include namespace mdlp { CPPFImdlp::CPPFImdlp(int algorithm):algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices)) { @@ -23,6 +24,23 @@ namespace mdlp { } indices = sortIndices(X_, y_); metrics.setData(y, indices); + + + for (auto i=0; i< X.size(); i++) { + if (i% 10 ==0) { + cout << " # Idx --X-- y"< Date: Wed, 15 Feb 2023 13:07:03 +0100 Subject: [PATCH 4/9] Add same_values to getCandidate and fine tune ValueCutPoint --- CPPFImdlp.cpp | 126 ++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 2e15907..c70efaf 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -4,43 +4,26 @@ #include #include "CPPFImdlp.h" #include "Metrics.h" -#include + namespace mdlp { - CPPFImdlp::CPPFImdlp(int algorithm):algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices)) - { + CPPFImdlp::CPPFImdlp(int algorithm) : algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), + metrics(Metrics(y, indices)) { } - CPPFImdlp::~CPPFImdlp() - = default; - CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_) - { + + CPPFImdlp::~CPPFImdlp() = default; + + CPPFImdlp &CPPFImdlp::fit(samples_t &X_, labels_t &y_) { X = X_; y = y_; cutPoints.clear(); if (X.size() != y.size()) { throw invalid_argument("X and y must have the same size"); } - if (X.size() == 0 || y.size() == 0) { + if (X.empty() || y.empty()) { throw invalid_argument("X and y must have at least one element"); } indices = sortIndices(X_, y_); metrics.setData(y, indices); - - - for (auto i=0; i< X.size(); i++) { - if (i% 10 ==0) { - cout << " # Idx --X-- y"< X[t-1] < X[t] @@ -68,9 +51,10 @@ namespace mdlp { } return (previous + actual) / 2; } - tuple CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end) - { + + tuple CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end) { size_t idxPrev = cut - 1; + bool fforward = false; precision_t previous, actual; previous = X[indices[idxPrev]]; actual = X[indices[cut]]; @@ -79,14 +63,19 @@ namespace mdlp { previous = X[indices[idxPrev]]; } // get the last equal value of X in the interval - while (actual == X[indices[++cut]] && cut < end); - if (previous == actual && cut < end) - actual = X[indices[cut]]; - cut--; + while (actual == X[indices[cut]] && cut + 1 < end) { + cut++; + fforward = true; + } + if (fforward) + cut--; + // try to get the next value if it can't be found backwards + if (previous == actual && cut + 1 < end) + actual = X[indices[cut + 1]]; return make_tuple((previous + actual) / 2, cut); } - void CPPFImdlp::computeCutPoints(size_t start, size_t end) - { + + void CPPFImdlp::computeCutPoints(size_t start, size_t end) { size_t cut; tuple result; if (end - start < 2) @@ -102,8 +91,8 @@ namespace mdlp { computeCutPoints(cut, end); } } - void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) - { + + void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) { size_t cut; if (end - start < 2) return; @@ -116,8 +105,8 @@ namespace mdlp { computeCutPointsAlternative(cut, end); } } - void CPPFImdlp::computeCutPointsClassic(size_t start, size_t end) - { + + void CPPFImdlp::computeCutPointsClassic(size_t start, size_t end) { size_t cut; cut = getCandidate(start, end); if (cut == numeric_limits::max() || !mdlp(start, cut, end)) { @@ -135,14 +124,17 @@ namespace mdlp { computeCutPoints(start, cut); computeCutPoints(cut, end); } - size_t CPPFImdlp::getCandidate(size_t start, size_t end) - { + + size_t CPPFImdlp::getCandidate(size_t start, size_t end) { /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which - E(A, TA; S) is minimal amogst all the candidate cut points. */ + E(A, TA; S) is minimal amongst all the candidate cut points. */ size_t candidate = numeric_limits::max(), elements = end - start; + bool same_values = true; precision_t entropy_left, entropy_right, minEntropy; minEntropy = metrics.entropy(start, end); for (auto idx = start + 1; idx < end; idx++) { + if (X[indices[idx]] != X[indices[idx - 1]]) + same_values = false; // Cutpoints are always on boundaries (definition 2) if (y[indices[idx]] == y[indices[idx - 1]]) continue; @@ -153,10 +145,13 @@ namespace mdlp { candidate = idx; } } + // If all the values of the variable in the interval are the same, it doesn't consider the cut point + if (same_values) + candidate = numeric_limits::max(); return candidate; } - bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) - { + + bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) { int k, k1, k2; precision_t ig, delta; precision_t ent, ent1, ent2; @@ -172,38 +167,37 @@ namespace mdlp { ent2 = metrics.entropy(cut, end); ig = metrics.informationGain(start, cut, end); delta = log2(pow(3, precision_t(k)) - 2) - - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); + (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } + // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_) - { + indices_t CPPFImdlp::sortIndices(samples_t &X_, labels_t &y_) { indices_t idx(X_.size()); iota(idx.begin(), idx.end(), 0); for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) - { - if (X_[i1] == X_[i2]) return y_[i1] < y_[i2]; - else - return X_[i1] < X_[i2]; - }); - return idx; - } - // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices1(samples_t& X_) - { - indices_t idx(X_.size()); - iota(idx.begin(), idx.end(), 0); - for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) - { + stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { + if (X_[i1] == X_[i2]) + return y_[i1] < y_[i2]; + else return X_[i1] < X_[i2]; - }); + }); return idx; } - cutPoints_t CPPFImdlp::getCutPoints() - { + + // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes + indices_t CPPFImdlp::sortIndices1(samples_t &X_) { + indices_t idx(X_.size()); + iota(idx.begin(), idx.end(), 0); + for (size_t i = 0; i < X_.size(); i++) + stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) { + return X_[i1] < X_[i2]; + }); + return idx; + } + + cutPoints_t CPPFImdlp::getCutPoints() { // Remove duplicates and sort cutPoints_t output(cutPoints.size()); set s; From dec12959332d11a780d894cc386aca7510b3298b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Feb 2023 18:23:05 +0100 Subject: [PATCH 5/9] Remove alternative and Classic Refactor ValueCutPoint Reefactor sameValues in getCandidate --- .github/workflows/build.yml | 26 ++++++ .gitignore | 1 + .vscode/launch.json | 22 +++++ .vscode/settings.json | 5 ++ CMakeLists.txt | 2 +- CPPFImdlp.cpp | 169 +++++++++++++----------------------- CPPFImdlp.h | 12 +-- Metrics.cpp | 6 +- sample/CMakeLists.txt | 2 +- sample/sample.cpp | 20 +++-- tests/ArffFiles.cpp | 4 +- tests/ArffFiles.h | 5 +- tests/FImdlp_unittest.cpp | 125 +++++++++++++------------- typesFImdlp.h | 2 +- 14 files changed, 198 insertions(+), 203 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..4625a01 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,26 @@ +name: Build + +on: + push: + branches: + - main + + +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis + - uses: sonarsource/sonarqube-scan-action@master + env: + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} + SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }} + # If you wish to fail your job when the Quality Gate is red, uncomment the + # following lines. This would typically be used to fail a deployment. + # - uses: sonarsource/sonarqube-quality-gate-action@master + # timeout-minutes: 5 + # env: + # SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/.gitignore b/.gitignore index be772d2..23b7ce1 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ .idea cmake-* **/CMakeFiles +sonar-project.properties diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..4d023ae --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,22 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "(lldb) Launch", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceRoot}/sample/build/sample", + "args": [ + "test" + ], + "stopAtEntry": false, + "cwd": "${workspaceRoot}/sample/build/", + "environment": [], + "externalConsole": false, + "MIMode": "lldb" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..09b14a2 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "sonarlint.connectedMode.project": { + "projectKey": "rmontanana_mdlp_AYZkjILJHyjW-meBaElG" + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 382ca27..ff48211 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.20) project(mdlp) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) add_library(mdlp CPPFImdlp.cpp Metrics.cpp) diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index c70efaf..59b9b4b 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -6,13 +6,15 @@ #include "Metrics.h" namespace mdlp { - CPPFImdlp::CPPFImdlp(int algorithm) : algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), - metrics(Metrics(y, indices)) { - } + CPPFImdlp::CPPFImdlp(): indices(indices_t()), X(samples_t()), y(labels_t()), + metrics(Metrics(y, indices)) + { + } CPPFImdlp::~CPPFImdlp() = default; - CPPFImdlp &CPPFImdlp::fit(samples_t &X_, labels_t &y_) { + CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_) + { X = X_; y = y_; cutPoints.clear(); @@ -24,117 +26,75 @@ namespace mdlp { } indices = sortIndices(X_, y_); metrics.setData(y, indices); - switch (algorithm) { - case 0: - computeCutPoints(0, X.size()); - break; - case 1: - computeCutPointsAlternative(0, X.size()); - break; - case 2: - indices = sortIndices1(X_); - metrics.setData(y, indices); - computeCutPointsClassic(0, X.size()); - break; - default: - throw invalid_argument("algorithm must be 0, 1 or 2"); - } + computeCutPoints(0, X.size()); return *this; } - precision_t CPPFImdlp::halfWayValueCutPoint(size_t start, size_t idx) { - size_t idxPrev = idx - 1; - precision_t previous = X[indices[idxPrev]], actual = X[indices[idx]]; - // definition 2 of the paper => X[t-1] < X[t] - while (idxPrev-- > start && actual == previous) { - previous = X[indices[idxPrev]]; - } - return (previous + actual) / 2; - } - - tuple CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end) { - size_t idxPrev = cut - 1; - bool fforward = false; - precision_t previous, actual; + pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) + { + size_t n, m, idxPrev = cut - 1 <= start ? cut - 1 : cut; + size_t idxNext = cut + 1 < end ? cut + 1 : cut; + bool backWall; // true if duplicates reach begining of the interval + precision_t previous, actual, next; previous = X[indices[idxPrev]]; actual = X[indices[cut]]; + next = X[indices[idxNext]]; // definition 2 of the paper => X[t-1] < X[t] - while (idxPrev-- > start && actual == previous) { - previous = X[indices[idxPrev]]; + // get the first equal value of X in the interval + while (idxPrev > start && actual == previous) { + previous = X[indices[--idxPrev]]; } + backWall = idxPrev == start && actual == previous; // get the last equal value of X in the interval - while (actual == X[indices[cut]] && cut + 1 < end) { - cut++; - fforward = true; + while (idxNext < end - 1 && actual == next) { + next = X[indices[++idxNext]]; } - if (fforward) - cut--; - // try to get the next value if it can't be found backwards - if (previous == actual && cut + 1 < end) - actual = X[indices[cut + 1]]; - return make_tuple((previous + actual) / 2, cut); + // # of duplicates before cutpoint + n = cut - 1 - idxPrev; + // # of duplicates after cutpoint + m = idxNext - cut - 1; + // Decide which values to use + cut = cut + (backWall ? m + 1 : -n); + actual = X[indices[cut]]; + return { (actual + previous) / 2, cut }; } - void CPPFImdlp::computeCutPoints(size_t start, size_t end) { + void CPPFImdlp::computeCutPoints(size_t start, size_t end) + { size_t cut; - tuple result; - if (end - start < 2) + pair result; + if (end - start < 3) return; cut = getCandidate(start, end); if (cut == numeric_limits::max()) return; if (mdlp(start, cut, end)) { - result = completeValueCutPoint(start, cut, end); - cut = get<1>(result); - cutPoints.push_back(get<0>(result)); + result = valueCutPoint(start, cut, end); + cut = result.second; + cutPoints.push_back(result.first); computeCutPoints(start, cut); computeCutPoints(cut, end); } } - void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) { - size_t cut; - if (end - start < 2) - return; - cut = getCandidate(start, end); - if (cut == numeric_limits::max()) - return; - if (mdlp(start, cut, end)) { - cutPoints.push_back(halfWayValueCutPoint(start, cut)); - computeCutPointsAlternative(start, cut); - computeCutPointsAlternative(cut, end); - } - } - - void CPPFImdlp::computeCutPointsClassic(size_t start, size_t end) { - size_t cut; - cut = getCandidate(start, end); - if (cut == numeric_limits::max() || !mdlp(start, cut, end)) { - // cut.value == -1 means that there is no candidate in the interval - // No boundary found, so we add both ends of the interval as cutpoints - // because they were selected by the algorithm before - if (start == end) - return; - if (start != 0) - cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2); - if (end != X.size()) - cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2); - return; - } - computeCutPoints(start, cut); - computeCutPoints(cut, end); - } - - size_t CPPFImdlp::getCandidate(size_t start, size_t end) { + size_t CPPFImdlp::getCandidate(size_t start, size_t end) + { /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which E(A, TA; S) is minimal amongst all the candidate cut points. */ size_t candidate = numeric_limits::max(), elements = end - start; - bool same_values = true; + bool sameValues = true; precision_t entropy_left, entropy_right, minEntropy; + // Check if all the values of the variable in the interval are the same + for (size_t idx = start + 1; idx < end; idx++) { + if (X[indices[idx]] != X[indices[start]]) { + sameValues = false; + break; + } + } + if (sameValues) + return candidate; minEntropy = metrics.entropy(start, end); - for (auto idx = start + 1; idx < end; idx++) { - if (X[indices[idx]] != X[indices[idx - 1]]) - same_values = false; + for (size_t idx = start + 1; idx < end; idx++) { // Cutpoints are always on boundaries (definition 2) if (y[indices[idx]] == y[indices[idx - 1]]) continue; @@ -145,13 +105,11 @@ namespace mdlp { candidate = idx; } } - // If all the values of the variable in the interval are the same, it doesn't consider the cut point - if (same_values) - candidate = numeric_limits::max(); return candidate; } - bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) { + bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) + { int k, k1, k2; precision_t ig, delta; precision_t ent, ent1, ent2; @@ -167,37 +125,28 @@ namespace mdlp { ent2 = metrics.entropy(cut, end); ig = metrics.informationGain(start, cut, end); delta = log2(pow(3, precision_t(k)) - 2) - - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); + (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples_t &X_, labels_t &y_) { + indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_) + { indices_t idx(X_.size()); iota(idx.begin(), idx.end(), 0); for (size_t i = 0; i < X_.size(); i++) stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { - if (X_[i1] == X_[i2]) - return y_[i1] < y_[i2]; - else - return X_[i1] < X_[i2]; - }); - return idx; - } - - // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices1(samples_t &X_) { - indices_t idx(X_.size()); - iota(idx.begin(), idx.end(), 0); - for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) { + if (X_[i1] == X_[i2]) + return y_[i1] < y_[i2]; + else return X_[i1] < X_[i2]; - }); + }); return idx; } - cutPoints_t CPPFImdlp::getCutPoints() { + cutPoints_t CPPFImdlp::getCutPoints() + { // Remove duplicates and sort cutPoints_t output(cutPoints.size()); set s; diff --git a/CPPFImdlp.h b/CPPFImdlp.h index 24b2877..0280cd9 100644 --- a/CPPFImdlp.h +++ b/CPPFImdlp.h @@ -3,12 +3,10 @@ #include "typesFImdlp.h" #include "Metrics.h" #include -#include #include namespace mdlp { class CPPFImdlp { protected: - int algorithm; indices_t indices; samples_t X; labels_t y; @@ -16,20 +14,16 @@ namespace mdlp { cutPoints_t cutPoints; static indices_t sortIndices(samples_t&, labels_t&); - static indices_t sortIndices1(samples_t&); void computeCutPoints(size_t, size_t); - void computeCutPointsAlternative(size_t, size_t); - void computeCutPointsClassic(size_t, size_t); bool mdlp(size_t, size_t, size_t); size_t getCandidate(size_t, size_t); - precision_t halfWayValueCutPoint(size_t, size_t); - tuple completeValueCutPoint(size_t, size_t, size_t); + pair valueCutPoint(size_t, size_t, size_t); public: - CPPFImdlp(int algorithm = 0); + CPPFImdlp(); ~CPPFImdlp(); CPPFImdlp& fit(samples_t&, labels_t&); samples_t getCutPoints(); - inline string version() { return "1.0.0"; }; + inline string version() { return "1.1.0"; }; }; } #endif \ No newline at end of file diff --git a/Metrics.cpp b/Metrics.cpp index 1275b00..766e508 100644 --- a/Metrics.cpp +++ b/Metrics.cpp @@ -29,8 +29,8 @@ namespace mdlp { labels_t counts(numClasses + 1, 0); if (end - start < 2) return 0; - if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) { - return entropyCache[make_tuple(start, end)]; + if (entropyCache.find({ start, end }) != entropyCache.end()) { + return entropyCache[{start, end}]; } for (auto i = &indices[start]; i != &indices[end]; ++i) { counts[y[*i]]++; @@ -42,7 +42,7 @@ namespace mdlp { ventropy -= p * log2(p); } } - entropyCache[make_tuple(start, end)] = ventropy; + entropyCache[{start, end}] = ventropy; return ventropy; } precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index 6fea95c..68ba4df 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.20) project(main) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) add_executable(sample sample.cpp ../tests/ArffFiles.cpp ../Metrics.cpp ../CPPFImdlp.cpp) diff --git a/sample/sample.cpp b/sample/sample.cpp index 18efdb9..0797e96 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -7,18 +7,20 @@ using namespace std; using namespace mdlp; + int main(int argc, char** argv) { ArffFiles file; vector lines; string path = "../../tests/datasets/"; - map datasets = { - {"mfeat-factors", true}, - {"iris", true}, - {"letter", true}, - {"glass", true}, - {"kdd_JapaneseVowels", false}, - {"test", true} + map datasets = { + {"mfeat-factors", true}, + {"iris", true}, + {"letter", true}, + {"glass", true}, + {"kdd_JapaneseVowels", false}, + {"mfeat-factors", true}, + {"test", true} }; if (argc != 2 || datasets.find(argv[1]) == datasets.end()) { cout << "Usage: " << argv[0] << " {mfeat-factors, glass, iris, letter, kdd_JapaneseVowels, test}" << endl; @@ -44,9 +46,11 @@ int main(int argc, char** argv) } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0); + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(); for (auto i = 0; i < attributes.size(); i++) { + auto min_max = minmax_element(X[i].begin(), X[i].end()); cout << "Cut points for " << get<0>(attributes[i]) << endl; + cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl; cout << "--------------------------" << setprecision(3) << endl; test.fit(X[i], y); for (auto item : test.getCutPoints()) { diff --git a/tests/ArffFiles.cpp b/tests/ArffFiles.cpp index 7b59ef8..470f5fa 100644 --- a/tests/ArffFiles.cpp +++ b/tests/ArffFiles.cpp @@ -17,7 +17,7 @@ unsigned long int ArffFiles::getSize() { return lines.size(); } -vector> ArffFiles::getAttributes() +vector> ArffFiles::getAttributes() { return attributes; } @@ -50,7 +50,7 @@ void ArffFiles::load(string fileName, bool classLast) if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { stringstream ss(line); ss >> keyword >> attribute >> type; - attributes.push_back(make_tuple(attribute, type)); + attributes.push_back({ attribute, type }); continue; } if (line[0] == '@') { diff --git a/tests/ArffFiles.h b/tests/ArffFiles.h index 6986d3b..b56d28d 100644 --- a/tests/ArffFiles.h +++ b/tests/ArffFiles.h @@ -2,12 +2,11 @@ #define ARFFFILES_H #include #include -#include using namespace std; class ArffFiles { private: vector lines; - vector> attributes; + vector> attributes; string className, classType; vector> X; vector y; @@ -22,7 +21,7 @@ public: string trim(const string&); vector>& getX(); vector& getY(); - vector> getAttributes(); + vector> getAttributes(); vector factorize(const vector& labels_t); }; #endif \ No newline at end of file diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index e86a156..2e5757e 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -13,18 +13,13 @@ namespace mdlp { { X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; - algorithm = false; fit(X, y); } - void setalgorithm(bool value) - { - algorithm = value; - } void checkSortedVector() { indices_t testSortedIndices = sortIndices(X, y); precision_t prev = X[testSortedIndices[0]]; - for (auto i = 0; i < X.size(); ++i) { + for (unsigned long i = 0; i < X.size(); ++i) { EXPECT_EQ(testSortedIndices[i], indices[i]); EXPECT_LE(prev, X[testSortedIndices[i]]); prev = X[testSortedIndices[i]]; @@ -34,7 +29,7 @@ namespace mdlp { { int expectedSize = expected.size(); EXPECT_EQ(cutPoints.size(), expectedSize); - for (auto i = 0; i < cutPoints.size(); i++) { + for (unsigned long i = 0; i < cutPoints.size(); i++) { EXPECT_NEAR(cutPoints[i], expected[i], precision); } } @@ -47,6 +42,19 @@ namespace mdlp { EXPECT_NEAR(expected[i], computed[i], precision); } } + bool test_result(samples_t& X_, size_t cut, float midPoint, size_t limit, string title) + { + pair result; + labels_t y_ = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + X = X_; + y = y_; + indices = sortIndices(X, y); + cout << "* " << title << endl; + result = valueCutPoint(0, cut, 10); + EXPECT_NEAR(result.first, midPoint, precision); + EXPECT_EQ(result.second, limit); + return true; + } }; TEST_F(TestFImdlp, FitErrorEmptyDataset) { @@ -54,11 +62,6 @@ namespace mdlp { y = labels_t(); EXPECT_THROW(fit(X, y), std::invalid_argument); } - TEST_F(TestFImdlp, FitErrorIncorrectAlgorithm) - { - algorithm = 2; - EXPECT_THROW(fit(X, y), std::invalid_argument); - } TEST_F(TestFImdlp, FitErrorDifferentSize) { X = { 1, 2, 3 }; @@ -83,31 +86,41 @@ namespace mdlp { y = { 2, 2, 1 }; indices = { 1, 2, 0 }; } - TEST_F(TestFImdlp, TestArtificialDatasetAlternative) + TEST_F(TestFImdlp, TestShortDatasets) { - algorithm = 1; + vector computed; + X = { 1 }; + y = { 1 }; fit(X, y); - computeCutPoints(0, 20); - cutPoints_t expected = { 5.0500001907348633 }; - vector computed = getCutPoints(); computed = getCutPoints(); - int expectedSize = expected.size(); - EXPECT_EQ(computed.size(), expected.size()); - for (auto i = 0; i < computed.size(); i++) { - EXPECT_NEAR(computed[i], expected[i], precision); - } + EXPECT_EQ(computed.size(), 0); + X = { 1, 3 }; + y = { 1, 2 }; + fit(X, y); + computed = getCutPoints(); + EXPECT_EQ(computed.size(), 0); + X = { 2, 4 }; + y = { 1, 2 }; + fit(X, y); + computed = getCutPoints(); + EXPECT_EQ(computed.size(), 0); + X = { 1, 2, 3 }; + y = { 1, 2, 2 }; + fit(X, y); + computed = getCutPoints(); + EXPECT_EQ(computed.size(), 1); + EXPECT_NEAR(computed[0], 1.5, precision); } TEST_F(TestFImdlp, TestArtificialDataset) { - algorithm = 0; fit(X, y); computeCutPoints(0, 20); - cutPoints_t expected = { 5.0500001907348633 }; + cutPoints_t expected = { 5.05 }; vector computed = getCutPoints(); computed = getCutPoints(); int expectedSize = expected.size(); EXPECT_EQ(computed.size(), expected.size()); - for (auto i = 0; i < computed.size(); i++) { + for (unsigned long i = 0; i < computed.size(); i++) { EXPECT_NEAR(computed[i], expected[i], precision); } } @@ -116,44 +129,17 @@ namespace mdlp { ArffFiles file; string path = "../datasets/"; - file.load(path + "iris.arff", true); - int items = file.getSize(); - vector& X = file.getX(); - vector expected = { - { 5.4499998092651367, 6.25 }, - { 2.8499999046325684, 3, 3.0499999523162842, 3.3499999046325684 }, - { 2.4500000476837158, 4.75, 5.0500001907348633 }, - { 0.80000001192092896, 1.4500000476837158, 1.75 } - }; - labels_t& y = file.getY(); - auto attributes = file.getAttributes(); - algorithm = 0; - for (auto feature = 0; feature < attributes.size(); feature++) { - fit(X[feature], y); - vector computed = getCutPoints(); - EXPECT_EQ(computed.size(), expected[feature].size()); - for (auto i = 0; i < computed.size(); i++) { - EXPECT_NEAR(computed[i], expected[feature][i], precision); - } - } - } - TEST_F(TestFImdlp, TestIrisAlternative) - { - ArffFiles file; - string path = "../datasets/"; - file.load(path + "iris.arff", true); int items = file.getSize(); vector& X = file.getX(); vector expected = { { 5.4499998092651367, 5.75 }, - { 2.8499999046325684, 3.3499999046325684 }, - { 2.4500000476837158, 4.75 }, + { 2.75, 2.85, 2.95, 3.05, 3.35 }, + { 2.4500000476837158, 4.75, 5.0500001907348633 }, { 0.80000001192092896, 1.75 } }; labels_t& y = file.getY(); auto attributes = file.getAttributes(); - algorithm = 1; for (auto feature = 0; feature < attributes.size(); feature++) { fit(X[feature], y); vector computed = getCutPoints(); @@ -166,21 +152,30 @@ namespace mdlp { TEST_F(TestFImdlp, ComputeCutPointsGCase) { cutPoints_t expected; - algorithm = 0; expected = { 1.5 }; - samples_t X_ = { 0, 1, 2, 2 }; - labels_t y_ = { 1, 1, 1, 2 }; + samples_t X_ = { 0, 1, 2, 2, 2 }; + labels_t y_ = { 1, 1, 1, 2, 2 }; fit(X_, y_); checkCutPoints(expected); } - TEST_F(TestFImdlp, ComputeCutPointsAlternativeGCase) + TEST_F(TestFImdlp, CompleteValueCutPoint) { - cutPoints_t expected; - expected = { 1.5 }; - algorithm = true; - samples_t X_ = { 0, 1, 2, 2 }; - labels_t y_ = { 1, 1, 1, 2 }; - fit(X_, y_); - checkCutPoints(expected); + // Case titles as stated in the doc + samples_t X1a{ 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0 }; + test_result(X1a, 6, 7.3 / 2, 6, "1a"); + samples_t X2a = { 3.1, 3.2, 3.3, 3.4, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 }; + test_result(X2a, 6, 7.1 / 2, 4, "2a"); + samples_t X2b = { 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 }; + test_result(X2b, 6, 7.5 / 2, 7, "2b"); + samples_t X3a = { 3.1, 3.2, 3.3, 3.4, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 }; + test_result(X3a, 4, 7.1 / 2, 4, "3a"); + samples_t X3b = { 3.1, 3.2, 3.3, 3.4, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7 }; + test_result(X3b, 4, 7.1 / 2, 4, "3b"); + samples_t X4a = { 3.1, 3.2, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.9, 4.0 }; + test_result(X4a, 4, 6.9 / 2, 2, "4a"); + samples_t X4b = { 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.8, 3.9, 4.0 }; + test_result(X4b, 4, 7.5 / 2, 7, "4b"); + samples_t X4c = { 3.1, 3.2, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7 }; + test_result(X4c, 4, 6.9 / 2, 2, "4c"); } } diff --git a/typesFImdlp.h b/typesFImdlp.h index 4a175cd..753e333 100644 --- a/typesFImdlp.h +++ b/typesFImdlp.h @@ -11,7 +11,7 @@ namespace mdlp { typedef vector labels_t; typedef vector indices_t; typedef vector cutPoints_t; - typedef map, precision_t> cacheEnt_t; + typedef map, precision_t> cacheEnt_t; typedef map, precision_t> cacheIg_t; } #endif From 5bb0e1e6caa3a3ece42520160899fed52e4b5ad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 20 Feb 2023 18:52:14 +0100 Subject: [PATCH 6/9] Change name to test ValueCutPoint --- tests/FImdlp_unittest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index 2e5757e..32a9a50 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -158,7 +158,7 @@ namespace mdlp { fit(X_, y_); checkCutPoints(expected); } - TEST_F(TestFImdlp, CompleteValueCutPoint) + TEST_F(TestFImdlp, ValueCutPoint) { // Case titles as stated in the doc samples_t X1a{ 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0 }; From 79c029832a6a4d2981053547813ba62308d51549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 21 Feb 2023 18:49:57 +0100 Subject: [PATCH 7/9] Fix a sign mistake in valueCutPoint --- .vscode/launch.json | 2 +- CPPFImdlp.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 4d023ae..c865479 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,7 +10,7 @@ "request": "launch", "program": "${workspaceRoot}/sample/build/sample", "args": [ - "test" + "mfeat-factors" ], "stopAtEntry": false, "cwd": "${workspaceRoot}/sample/build/", diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 59b9b4b..1c6234f 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -32,10 +32,12 @@ namespace mdlp { pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) { - size_t n, m, idxPrev = cut - 1 <= start ? cut - 1 : cut; + size_t n, m, idxPrev = cut - 1 >= start ? cut - 1 : cut; size_t idxNext = cut + 1 < end ? cut + 1 : cut; bool backWall; // true if duplicates reach begining of the interval precision_t previous, actual, next; + if (cut - 1 < start || cut + 1 >= end) + throw logic_error("Invalid cutpoint index"); previous = X[indices[idxPrev]]; actual = X[indices[cut]]; next = X[indices[idxNext]]; From de25ba78bd6eab4df7b56f69fdb0d47f9afaecb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 21 Feb 2023 18:58:54 +0100 Subject: [PATCH 8/9] Remove unused variable in sample --- sample/sample.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sample/sample.cpp b/sample/sample.cpp index 0797e96..2edf3bd 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -11,7 +11,6 @@ using namespace mdlp; int main(int argc, char** argv) { ArffFiles file; - vector lines; string path = "../../tests/datasets/"; map datasets = { {"mfeat-factors", true}, From a44f01460af83fcc749c2d77b9f7957ff98e826b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 21 Feb 2023 18:59:45 +0100 Subject: [PATCH 9/9] Remove duplicate dataset in sample --- sample/sample.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sample/sample.cpp b/sample/sample.cpp index 2edf3bd..02ef84b 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -18,7 +18,6 @@ int main(int argc, char** argv) {"letter", true}, {"glass", true}, {"kdd_JapaneseVowels", false}, - {"mfeat-factors", true}, {"test", true} }; if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {