You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

svm-scale.c 8.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  #include <ctype.h>
  #include <float.h>
  #include <limits.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  /*
   * Write the command-line usage summary to stdout, then terminate the
   * process with exit status 1.  This function never returns.
   */
  void exit_with_help()
  {
      static const char usage_text[] =
          "Usage: svm-scale [options] data_filename\n"
          "options:\n"
          "-l lower : x scaling lower limit (default -1)\n"
          "-u upper : x scaling upper limit (default +1)\n"
          "-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
          "-s save_filename : save scaling parameters to save_filename\n"
          "-r restore_filename : restore scaling parameters from restore_filename\n";

      /* The text contains no conversion specifiers, so fputs emits it verbatim. */
      fputs(usage_text, stdout);
      exit(1);
  }
  /* Line buffer shared by every pass; readline() fills it and grows it on demand. */
  char *line = NULL;
  /* Current capacity of `line` in bytes; doubled by readline() for long lines. */
  int max_line_len = 1024;

  /* Target range for scaled feature values (-l / -u, or restored from file). */
  double lower = -1.0;
  double upper = 1.0;
  /* Target range for scaled labels; only meaningful while y_scaling is non-zero. */
  double y_lower;
  double y_upper;
  /* Non-zero once -y was given or a restore file carried a 'y' section. */
  int y_scaling = 0;

  /* Per-feature extrema, indexed by feature index (allocated in main). */
  double *feature_max;
  double *feature_min;
  /* Label extrema; start at the opposite ends so the first label replaces them. */
  double y_max = -DBL_MAX;
  double y_min = DBL_MAX;

  /* Largest and smallest feature index encountered. */
  int max_index;
  int min_index;

  /* Feature entries counted in the input, and non-zero entries actually printed. */
  long int num_nonzeros = 0;
  long int new_num_nonzeros = 0;

  /* Classic two-argument extrema; note that each argument may be evaluated twice. */
  #define max(a,b) (((a)>(b))?(a):(b))
  #define min(a,b) (((a)<(b))?(a):(b))

  /* Helpers defined after main(). */
  void output_target(double value);
  void output(int index, double value);
  char* readline(FILE *input);
  int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
  37. int main(int argc,char **argv)
  38. {
  39. int i,index;
  40. FILE *fp, *fp_restore = NULL;
  41. char *save_filename = NULL;
  42. char *restore_filename = NULL;
  43. for(i=1;i<argc;i++)
  44. {
  45. if(argv[i][0] != '-') break;
  46. ++i;
  47. switch(argv[i-1][1])
  48. {
  49. case 'l': lower = atof(argv[i]); break;
  50. case 'u': upper = atof(argv[i]); break;
  51. case 'y':
  52. y_lower = atof(argv[i]);
  53. ++i;
  54. y_upper = atof(argv[i]);
  55. y_scaling = 1;
  56. break;
  57. case 's': save_filename = argv[i]; break;
  58. case 'r': restore_filename = argv[i]; break;
  59. default:
  60. fprintf(stderr,"unknown option\n");
  61. exit_with_help();
  62. }
  63. }
  64. if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
  65. {
  66. fprintf(stderr,"inconsistent lower/upper specification\n");
  67. exit(1);
  68. }
  69. if(restore_filename && save_filename)
  70. {
  71. fprintf(stderr,"cannot use -r and -s simultaneously\n");
  72. exit(1);
  73. }
  74. if(argc != i+1)
  75. exit_with_help();
  76. fp=fopen(argv[i],"r");
  77. if(fp==NULL)
  78. {
  79. fprintf(stderr,"can't open file %s\n", argv[i]);
  80. exit(1);
  81. }
  82. line = (char *) malloc(max_line_len*sizeof(char));
  83. #define SKIP_TARGET\
  84. while(isspace(*p)) ++p;\
  85. while(!isspace(*p)) ++p;
  86. #define SKIP_ELEMENT\
  87. while(*p!=':') ++p;\
  88. ++p;\
  89. while(isspace(*p)) ++p;\
  90. while(*p && !isspace(*p)) ++p;
  91. /* assumption: min index of attributes is 1 */
  92. /* pass 1: find out max index of attributes */
  93. max_index = 0;
  94. min_index = 1;
  95. if(restore_filename)
  96. {
  97. int idx, c;
  98. fp_restore = fopen(restore_filename,"r");
  99. if(fp_restore==NULL)
  100. {
  101. fprintf(stderr,"can't open file %s\n", restore_filename);
  102. exit(1);
  103. }
  104. c = fgetc(fp_restore);
  105. if(c == 'y')
  106. {
  107. readline(fp_restore);
  108. readline(fp_restore);
  109. readline(fp_restore);
  110. }
  111. readline(fp_restore);
  112. readline(fp_restore);
  113. while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
  114. max_index = max(idx,max_index);
  115. rewind(fp_restore);
  116. }
  117. while(readline(fp)!=NULL)
  118. {
  119. char *p=line;
  120. SKIP_TARGET
  121. while(sscanf(p,"%d:%*f",&index)==1)
  122. {
  123. max_index = max(max_index, index);
  124. min_index = min(min_index, index);
  125. SKIP_ELEMENT
  126. num_nonzeros++;
  127. }
  128. }
  129. if(min_index < 1)
  130. fprintf(stderr,
  131. "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
  132. rewind(fp);
  133. feature_max = (double *)malloc((max_index+1)* sizeof(double));
  134. feature_min = (double *)malloc((max_index+1)* sizeof(double));
  135. if(feature_max == NULL || feature_min == NULL)
  136. {
  137. fprintf(stderr,"can't allocate enough memory\n");
  138. exit(1);
  139. }
  140. for(i=0;i<=max_index;i++)
  141. {
  142. feature_max[i]=-DBL_MAX;
  143. feature_min[i]=DBL_MAX;
  144. }
  145. /* pass 2: find out min/max value */
  146. while(readline(fp)!=NULL)
  147. {
  148. char *p=line;
  149. int next_index=1;
  150. double target;
  151. double value;
  152. if (sscanf(p,"%lf",&target) != 1)
  153. return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
  154. y_max = max(y_max,target);
  155. y_min = min(y_min,target);
  156. SKIP_TARGET
  157. while(sscanf(p,"%d:%lf",&index,&value)==2)
  158. {
  159. for(i=next_index;i<index;i++)
  160. {
  161. feature_max[i]=max(feature_max[i],0);
  162. feature_min[i]=min(feature_min[i],0);
  163. }
  164. feature_max[index]=max(feature_max[index],value);
  165. feature_min[index]=min(feature_min[index],value);
  166. SKIP_ELEMENT
  167. next_index=index+1;
  168. }
  169. for(i=next_index;i<=max_index;i++)
  170. {
  171. feature_max[i]=max(feature_max[i],0);
  172. feature_min[i]=min(feature_min[i],0);
  173. }
  174. }
  175. rewind(fp);
  176. /* pass 2.5: save/restore feature_min/feature_max */
  177. if(restore_filename)
  178. {
  179. /* fp_restore rewinded in finding max_index */
  180. int idx, c;
  181. double fmin, fmax;
  182. int next_index = 1;
  183. if((c = fgetc(fp_restore)) == 'y')
  184. {
  185. if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
  186. fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
  187. return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
  188. y_scaling = 1;
  189. }
  190. else
  191. ungetc(c, fp_restore);
  192. if (fgetc(fp_restore) == 'x')
  193. {
  194. if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
  195. return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
  196. while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
  197. {
  198. for(i = next_index;i<idx;i++)
  199. if(feature_min[i] != feature_max[i])
  200. fprintf(stderr,
  201. "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
  202. i, argv[argc-1], restore_filename);
  203. feature_min[idx] = fmin;
  204. feature_max[idx] = fmax;
  205. next_index = idx + 1;
  206. }
  207. for(i=next_index;i<=max_index;i++)
  208. if(feature_min[i] != feature_max[i])
  209. fprintf(stderr,
  210. "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
  211. i, argv[argc-1], restore_filename);
  212. }
  213. fclose(fp_restore);
  214. }
  215. if(save_filename)
  216. {
  217. FILE *fp_save = fopen(save_filename,"w");
  218. if(fp_save==NULL)
  219. {
  220. fprintf(stderr,"can't open file %s\n", save_filename);
  221. exit(1);
  222. }
  223. if(y_scaling)
  224. {
  225. fprintf(fp_save, "y\n");
  226. fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
  227. fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
  228. }
  229. fprintf(fp_save, "x\n");
  230. fprintf(fp_save, "%.16g %.16g\n", lower, upper);
  231. for(i=1;i<=max_index;i++)
  232. {
  233. if(feature_min[i]!=feature_max[i])
  234. fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
  235. }
  236. if(min_index < 1)
  237. fprintf(stderr,
  238. "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
  239. fclose(fp_save);
  240. }
  241. /* pass 3: scale */
  242. while(readline(fp)!=NULL)
  243. {
  244. char *p=line;
  245. int next_index=1;
  246. double target;
  247. double value;
  248. if (sscanf(p,"%lf",&target) != 1)
  249. return clean_up(NULL, fp, "ERROR: failed to read labels\n");
  250. output_target(target);
  251. SKIP_TARGET
  252. while(sscanf(p,"%d:%lf",&index,&value)==2)
  253. {
  254. for(i=next_index;i<index;i++)
  255. output(i,0);
  256. output(index,value);
  257. SKIP_ELEMENT
  258. next_index=index+1;
  259. }
  260. for(i=next_index;i<=max_index;i++)
  261. output(i,0);
  262. printf("\n");
  263. }
  264. if (new_num_nonzeros > num_nonzeros)
  265. fprintf(stderr,
  266. "WARNING: original #nonzeros %ld\n"
  267. " > new #nonzeros %ld\n"
  268. "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
  269. num_nonzeros, new_num_nonzeros);
  270. free(line);
  271. free(feature_max);
  272. free(feature_min);
  273. fclose(fp);
  274. return 0;
  275. }
  276. char* readline(FILE *input)
  277. {
  278. int len;
  279. if(fgets(line,max_line_len,input) == NULL)
  280. return NULL;
  281. while(strrchr(line,'\n') == NULL)
  282. {
  283. max_line_len *= 2;
  284. line = (char *) realloc(line, max_line_len);
  285. len = (int) strlen(line);
  286. if(fgets(line+len,max_line_len-len,input) == NULL)
  287. break;
  288. }
  289. return line;
  290. }
  291. void output_target(double value)
  292. {
  293. if(y_scaling)
  294. {
  295. if(value == y_min)
  296. value = y_lower;
  297. else if(value == y_max)
  298. value = y_upper;
  299. else value = y_lower + (y_upper-y_lower) *
  300. (value - y_min)/(y_max-y_min);
  301. }
  302. printf("%g ",value);
  303. }
  304. void output(int index, double value)
  305. {
  306. /* skip single-valued attribute */
  307. if(feature_max[index] == feature_min[index])
  308. return;
  309. if(value == feature_min[index])
  310. value = lower;
  311. else if(value == feature_max[index])
  312. value = upper;
  313. else
  314. value = lower + (upper-lower) *
  315. (value-feature_min[index])/
  316. (feature_max[index]-feature_min[index]);
  317. if(value != 0)
  318. {
  319. printf("%d:%g ",index, value);
  320. new_num_nonzeros++;
  321. }
  322. }
  323. int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
  324. {
  325. fprintf(stderr, "%s", msg);
  326. free(line);
  327. free(feature_max);
  328. free(feature_min);
  329. fclose(fp);
  330. if (fp_restore)
  331. fclose(fp_restore);
  332. return -1;
  333. }

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.