svm-scale.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
#include <ctype.h>
#include <float.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  6. void exit_with_help() {
  7. printf(
  8. "Usage: svm-scale [options] data_filename\n"
  9. "options:\n"
  10. "-l lower : x scaling lower limit (default -1)\n"
  11. "-u upper : x scaling upper limit (default +1)\n"
  12. "-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
  13. "-s save_filename : save scaling parameters to save_filename\n"
  14. "-r restore_filename : restore scaling parameters from restore_filename\n"
  15. );
  16. exit(1);
  17. }
  18. char *line = NULL;
  19. int max_line_len = 1024;
  20. double lower = -1.0, upper = 1.0, y_lower, y_upper;
  21. int y_scaling = 0;
  22. double *feature_max;
  23. double *feature_min;
  24. double y_max = -DBL_MAX;
  25. double y_min = DBL_MAX;
  26. int max_index;
  27. int min_index;
  28. long int num_nonzeros = 0;
  29. long int new_num_nonzeros = 0;
  30. #define max(x, y) (((x)>(y))?(x):(y))
  31. #define min(x, y) (((x)<(y))?(x):(y))
  32. void output_target(double value);
  33. void output(int index, double value);
  34. char *readline(FILE *input);
  35. int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
  36. int main(int argc, char **argv) {
  37. int i, index;
  38. FILE *fp, *fp_restore = NULL;
  39. char *save_filename = NULL;
  40. char *restore_filename = NULL;
  41. for (i = 1; i < argc; i++) {
  42. if (argv[i][0] != '-') break;
  43. ++i;
  44. switch (argv[i - 1][1]) {
  45. case 'l':
  46. lower = atof(argv[i]);
  47. break;
  48. case 'u':
  49. upper = atof(argv[i]);
  50. break;
  51. case 'y':
  52. y_lower = atof(argv[i]);
  53. ++i;
  54. y_upper = atof(argv[i]);
  55. y_scaling = 1;
  56. break;
  57. case 's':
  58. save_filename = argv[i];
  59. break;
  60. case 'r':
  61. restore_filename = argv[i];
  62. break;
  63. default:
  64. fprintf(stderr, "unknown option\n");
  65. exit_with_help();
  66. }
  67. }
  68. if (!(upper > lower) || (y_scaling && !(y_upper > y_lower))) {
  69. fprintf(stderr, "inconsistent lower/upper specification\n");
  70. exit(1);
  71. }
  72. if (restore_filename && save_filename) {
  73. fprintf(stderr, "cannot use -r and -s simultaneously\n");
  74. exit(1);
  75. }
  76. if (argc != i + 1)
  77. exit_with_help();
  78. fp = fopen(argv[i], "r");
  79. if (fp == NULL) {
  80. fprintf(stderr, "can't open file %s\n", argv[i]);
  81. exit(1);
  82. }
  83. line = (char *) malloc(max_line_len * sizeof(char));
  84. #define SKIP_TARGET\
  85. while(isspace(*p)) ++p;\
  86. while(!isspace(*p)) ++p;
  87. #define SKIP_ELEMENT\
  88. while(*p!=':') ++p;\
  89. ++p;\
  90. while(isspace(*p)) ++p;\
  91. while(*p && !isspace(*p)) ++p;
  92. /* assumption: min index of attributes is 1 */
  93. /* pass 1: find out max index of attributes */
  94. max_index = 0;
  95. min_index = 1;
  96. if (restore_filename) {
  97. int idx, c;
  98. fp_restore = fopen(restore_filename, "r");
  99. if (fp_restore == NULL) {
  100. fprintf(stderr, "can't open file %s\n", restore_filename);
  101. exit(1);
  102. }
  103. c = fgetc(fp_restore);
  104. if (c == 'y') {
  105. readline(fp_restore);
  106. readline(fp_restore);
  107. readline(fp_restore);
  108. }
  109. readline(fp_restore);
  110. readline(fp_restore);
  111. while (fscanf(fp_restore, "%d %*f %*f\n", &idx) == 1)
  112. max_index = max(idx, max_index);
  113. rewind(fp_restore);
  114. }
  115. while (readline(fp) != NULL) {
  116. char *p = line;
  117. SKIP_TARGET
  118. while (sscanf(p, "%d:%*f", &index) == 1) {
  119. max_index = max(max_index, index);
  120. min_index = min(min_index, index);
  121. SKIP_ELEMENT
  122. num_nonzeros++;
  123. }
  124. }
  125. if (min_index < 1)
  126. fprintf(stderr,
  127. "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
  128. rewind(fp);
  129. feature_max = (double *) malloc((max_index + 1) * sizeof(double));
  130. feature_min = (double *) malloc((max_index + 1) * sizeof(double));
  131. if (feature_max == NULL || feature_min == NULL) {
  132. fprintf(stderr, "can't allocate enough memory\n");
  133. exit(1);
  134. }
  135. for (i = 0; i <= max_index; i++) {
  136. feature_max[i] = -DBL_MAX;
  137. feature_min[i] = DBL_MAX;
  138. }
  139. /* pass 2: find out min/max value */
  140. while (readline(fp) != NULL) {
  141. char *p = line;
  142. int next_index = 1;
  143. double target;
  144. double value;
  145. if (sscanf(p, "%lf", &target) != 1)
  146. return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
  147. y_max = max(y_max, target);
  148. y_min = min(y_min, target);
  149. SKIP_TARGET
  150. while (sscanf(p, "%d:%lf", &index, &value) == 2) {
  151. for (i = next_index; i < index; i++) {
  152. feature_max[i] = max(feature_max[i], 0);
  153. feature_min[i] = min(feature_min[i], 0);
  154. }
  155. feature_max[index] = max(feature_max[index], value);
  156. feature_min[index] = min(feature_min[index], value);
  157. SKIP_ELEMENT
  158. next_index = index + 1;
  159. }
  160. for (i = next_index; i <= max_index; i++) {
  161. feature_max[i] = max(feature_max[i], 0);
  162. feature_min[i] = min(feature_min[i], 0);
  163. }
  164. }
  165. rewind(fp);
  166. /* pass 2.5: save/restore feature_min/feature_max */
  167. if (restore_filename) {
  168. /* fp_restore rewinded in finding max_index */
  169. int idx, c;
  170. double fmin, fmax;
  171. int next_index = 1;
  172. if ((c = fgetc(fp_restore)) == 'y') {
  173. if (fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
  174. fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
  175. return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
  176. y_scaling = 1;
  177. } else
  178. ungetc(c, fp_restore);
  179. if (fgetc(fp_restore) == 'x') {
  180. if (fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
  181. return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
  182. while (fscanf(fp_restore, "%d %lf %lf\n", &idx, &fmin, &fmax) == 3) {
  183. for (i = next_index; i < idx; i++)
  184. if (feature_min[i] != feature_max[i]) {
  185. fprintf(stderr,
  186. "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
  187. i, argv[argc - 1], restore_filename);
  188. feature_min[i] = 0;
  189. feature_max[i] = 0;
  190. }
  191. feature_min[idx] = fmin;
  192. feature_max[idx] = fmax;
  193. next_index = idx + 1;
  194. }
  195. for (i = next_index; i <= max_index; i++)
  196. if (feature_min[i] != feature_max[i]) {
  197. fprintf(stderr,
  198. "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
  199. i, argv[argc - 1], restore_filename);
  200. feature_min[i] = 0;
  201. feature_max[i] = 0;
  202. }
  203. }
  204. fclose(fp_restore);
  205. }
  206. if (save_filename) {
  207. FILE *fp_save = fopen(save_filename, "w");
  208. if (fp_save == NULL) {
  209. fprintf(stderr, "can't open file %s\n", save_filename);
  210. exit(1);
  211. }
  212. if (y_scaling) {
  213. fprintf(fp_save, "y\n");
  214. fprintf(fp_save, "%.17g %.17g\n", y_lower, y_upper);
  215. fprintf(fp_save, "%.17g %.17g\n", y_min, y_max);
  216. }
  217. fprintf(fp_save, "x\n");
  218. fprintf(fp_save, "%.17g %.17g\n", lower, upper);
  219. for (i = 1; i <= max_index; i++) {
  220. if (feature_min[i] != feature_max[i])
  221. fprintf(fp_save, "%d %.17g %.17g\n", i, feature_min[i], feature_max[i]);
  222. }
  223. if (min_index < 1)
  224. fprintf(stderr,
  225. "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n",
  226. save_filename);
  227. fclose(fp_save);
  228. }
  229. /* pass 3: scale */
  230. while (readline(fp) != NULL) {
  231. char *p = line;
  232. int next_index = 1;
  233. double target;
  234. double value;
  235. if (sscanf(p, "%lf", &target) != 1)
  236. return clean_up(NULL, fp, "ERROR: failed to read labels\n");
  237. output_target(target);
  238. SKIP_TARGET
  239. while (sscanf(p, "%d:%lf", &index, &value) == 2) {
  240. for (i = next_index; i < index; i++)
  241. output(i, 0);
  242. output(index, value);
  243. SKIP_ELEMENT
  244. next_index = index + 1;
  245. }
  246. for (i = next_index; i <= max_index; i++)
  247. output(i, 0);
  248. printf("\n");
  249. }
  250. if (new_num_nonzeros > num_nonzeros)
  251. fprintf(stderr,
  252. "WARNING: original #nonzeros %ld\n"
  253. " > new #nonzeros %ld\n"
  254. "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
  255. num_nonzeros, new_num_nonzeros);
  256. free(line);
  257. free(feature_max);
  258. free(feature_min);
  259. fclose(fp);
  260. return 0;
  261. }
  262. char *readline(FILE *input) {
  263. int len;
  264. if (fgets(line, max_line_len, input) == NULL)
  265. return NULL;
  266. while (strrchr(line, '\n') == NULL) {
  267. max_line_len *= 2;
  268. line = (char *) realloc(line, max_line_len);
  269. len = (int) strlen(line);
  270. if (fgets(line + len, max_line_len - len, input) == NULL)
  271. break;
  272. }
  273. return line;
  274. }
  275. void output_target(double value) {
  276. if (y_scaling) {
  277. if (value == y_min)
  278. value = y_lower;
  279. else if (value == y_max)
  280. value = y_upper;
  281. else
  282. value = y_lower + (y_upper - y_lower) *
  283. (value - y_min) / (y_max - y_min);
  284. }
  285. printf("%.17g ", value);
  286. }
  287. void output(int index, double value) {
  288. /* skip single-valued attribute */
  289. if (feature_max[index] == feature_min[index])
  290. return;
  291. if (value == feature_min[index])
  292. value = lower;
  293. else if (value == feature_max[index])
  294. value = upper;
  295. else
  296. value = lower + (upper - lower) *
  297. (value - feature_min[index]) /
  298. (feature_max[index] - feature_min[index]);
  299. if (value != 0) {
  300. printf("%d:%g ", index, value);
  301. new_num_nonzeros++;
  302. }
  303. }
  304. int clean_up(FILE *fp_restore, FILE *fp, const char *msg) {
  305. fprintf(stderr, "%s", msg);
  306. free(line);
  307. free(feature_max);
  308. free(feature_min);
  309. fclose(fp);
  310. if (fp_restore)
  311. fclose(fp_restore);
  312. return -1;
  313. }