  % LoadArff.m
  %
  % This function loads data from an ARFF file and returns the data, metadata,
  % attributes, relation and comments. All returned strings are lower case.
  %
  % input:
  %   arffFile   - path to ARFF file to read
  %
  % output:
  %   data       - data stored in the ARFF file
  %   metadata   - structure holding metadata in the form:
  %                metadata.{width_px, height_px, width_mm, height_mm, distance_mm},
  %                each -1 if not available. Extra metadata are stored in
  %                metadata.extra, which is an nx2 cell array holding
  %                name-value pairs
  %   attributes - nx2 cell array with attribute names and types, where n is
  %                the number of attributes
  %   relation   - relation described in ARFF
  %   comments   - 1xn cell array containing one comment line per cell
  function [data, metadata, attributes, relation, comments] = LoadArff(arffFile)
      % LOADARFF  Read the header and data section of an ARFF file.
      %
      % input:
      %   arffFile   - path to the ARFF file to read
      %
      % output:
      %   data       - numeric matrix with one column per attribute; nominal
      %                attributes are replaced by the values looked up in the
      %                map returned by IsNomAttribute
      %   metadata   - struct with fields width_px, height_px, width_mm,
      %                height_mm, distance_mm (-1 when absent or unparsable)
      %                and extra, an nx2 cell array of name/value strings for
      %                every other "%@METADATA" entry
      %   attributes - nx2 cell array {name, type}; names are lower-cased
      %   relation   - lower-cased relation name
      %   comments   - cell array (grown as a row) with one comment line per cell
      %
      % Raises an error with identifier 'LoadArff:fileOpen' when the file cannot
      % be opened.

      % initialize outputs
      data = [];
      metadata.width_px = -1;
      metadata.height_px = -1;
      metadata.width_mm = -1;
      metadata.height_mm = -1;
      metadata.distance_mm = -1;
      metadata.extra = {};
      attributes = {};
      relation = '';
      comments = {};

      % metadata keys that are stored as numeric struct fields
      knownMeta = {'width_px', 'height_px', 'width_mm', 'height_mm', 'distance_mm'};

      % nominal attribute handling
      nomMat = logical([]);
      nomMaps = {};

      [fid, openMsg] = fopen(arffFile, 'r');
      if (fid == -1)
          error('LoadArff:fileOpen', 'Could not open ARFF file "%s": %s', arffFile, openMsg);
      end
      % make sure the handle is released even if parsing fails below
      closeFile = onCleanup(@() fclose(fid));

      % read header
      fline = fgetl(fid);
      while (ischar(fline))
          % split on any run of whitespace so that tab-separated or indented
          % header lines are recognised as well
          words = strsplit(strtrim(fline));
          numWords = size(words, 2);
          keyword = words{1,1};

          if (numWords > 1 && strcmpi(keyword, '@relation'))
              relation = lower(words{1,2});

          elseif (numWords > 2 && strcmpi(keyword, '%@metadata'))
              metaName = lower(words{1,2});
              if (any(strcmp(metaName, knownMeta)))
                  % str2double instead of str2num: str2num evaluates its
                  % argument, which must not happen with text from a file
                  value = str2double(words{1,3});
                  if (~isnan(value))
                      metadata.(metaName) = value;
                  end
              else
                  % keep every other metadata entry as a name/value string pair
                  pos = size(metadata.extra, 1) + 1;
                  metadata.extra{pos,1} = words{1,2};
                  metadata.extra{pos,2} = words{1,3};
              end

          elseif (numWords > 2 && strcmpi(keyword, '@attribute'))
              index = size(attributes, 1) + 1;
              attributes{index,1} = lower(words{1,2});
              attributes{index,2} = words{1,3};

              % IsNomAttribute / GetNomAttValue are helpers defined outside
              % this file; they receive the untouched header line
              [isNom, nominalMap] = IsNomAttribute(fline);
              nomMat = [nomMat; isNom];
              if (isNom)
                  nomMaps = [nomMaps; {nominalMap}];
                  attributes{index,2} = GetNomAttValue(fline);
              else
                  nomMaps = [nomMaps; {[]}];
              end

          elseif (~isempty(fline) && fline(1) == '%')
              % plain comment line (metadata lines were handled above)
              comments{end+1} = fline;

          elseif (strcmpi(keyword, '@data'))
              % data section reached; the file position is now at its first row
              break;
          end

          fline = fgetl(fid);
      end

      % build the textscan format: strings for nominal columns, floats otherwise
      numAtts = size(attributes, 1);
      readFormat = '';
      for ind = 1:numAtts
          if (nomMat(ind))
              readFormat = [readFormat '%s '];
          else
              readFormat = [readFormat '%f '];
          end
      end

      lines = textscan(fid, readFormat, 'Delimiter', ',');

      % replace every nominal string by the value stored in its attribute map
      nomIndices = find(nomMat);
      for nomInd = nomIndices(:)'
          column = lines{1,nomInd};
          attMap = nomMaps{nomInd,1};
          for ind = 1:size(column, 1)
              column{ind} = attMap(column{ind});
          end
          lines{1,nomInd} = cell2mat(column);
      end

      data = cell2mat(lines);
  end