mlpack  2.2.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
increment_policy.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_POLICY_HPP
13 #define MLPACK_CORE_DATA_MAP_POLICIES_INCREMENT_POLICY_HPP
14 
15 #include <mlpack/prereqs.hpp>
16 #include <unordered_map>
17 #include <boost/bimap.hpp>
19 
20 namespace mlpack {
21 namespace data {
30 {
31  public:
32  // Type of the values that mapped (categorical) strings are assigned.
33  using MappedType = size_t;
34 
36  static const bool NeedsFirstPass = true;
37 
41  template<typename T>
42  void MapFirstPass(const std::string& string,
43  const size_t dim,
44  std::vector<Datatype>& types)
45  {
46  if (types[dim] == Datatype::categorical)
47  {
48  // No need to check; it's already categorical.
49  return;
50  }
51 
52  // Otherwise we need to attempt to read the value. If the read fails, the
53  // dimension is categorical; otherwise we leave it at the default of
54  // numeric.
55  std::stringstream token;
56  token.str(string);
57  T val;
58  token >> val;
59 
60  if (token.fail() || !token.eof())
61  {
62  // Parsing failed; the dimension is categorical.
63  types[dim] = Datatype::categorical;
64  }
65  }
66 
80  template<typename MapType, typename T>
81  T MapString(const std::string& string,
82  const size_t dimension,
83  MapType& maps,
84  std::vector<Datatype>& types)
85  {
86  // If we are in a categorical dimension we already know we need to map.
87  if (types[dimension] == Datatype::numeric)
88  {
89  // Check if this string needs to be mapped or if it can be read
90  // directly as a number. This will be true if nothing else in this
91  // dimension has yet been mapped, but this can't be read as a number.
92  std::stringstream token;
93  token.str(string);
94  T val;
95  token >> val;
96 
97  if (!token.fail() && token.eof())
98  {
99  // We can return what we have.
100  return val;
101  }
102  }
103 
104  // The token must be mapped.
105 
106  // If this condition is true, either we have no mapping for the given string
107  // or we have no mappings for the given dimension at all. In either case,
108  // we create a mapping.
109  if (maps.count(dimension) == 0 ||
110  maps[dimension].first.left.count(string) == 0)
111  {
112  // This string does not exist yet.
113  size_t& numMappings = maps[dimension].second;
114 
115  // Change type of the feature to categorical.
116  if (numMappings == 0)
117  types[dimension] = Datatype::categorical;
118 
119  typedef boost::bimap<std::string, MappedType>::value_type PairType;
120  maps[dimension].first.insert(PairType(string, numMappings));
121  return T(numMappings++);
122  }
123  else
124  {
125  // This string already exists in the mapping.
126  return maps[dimension].first.left.at(string);
127  }
128  }
129 
145  template <typename eT, typename MapType>
146  void MapTokens(const std::vector<std::string>& tokens,
147  size_t& row,
148  arma::Mat<eT>& matrix,
149  MapType& maps,
150  std::vector<Datatype>& types)
151  {
152  auto notNumber = [](const std::string& str)
153  {
154  eT val(0);
155  std::stringstream token;
156  token.str(str);
157  token >> val;
158  return token.fail();
159  };
160 
161  const bool notNumeric = std::any_of(std::begin(tokens),
162  std::end(tokens), notNumber);
163  if (notNumeric)
164  {
165  for (size_t i = 0; i != tokens.size(); ++i)
166  {
167  const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
168  types));
169  matrix.at(row, i) = val;
170  }
171  }
172  else
173  {
174  std::stringstream token;
175  for (size_t i = 0; i != tokens.size(); ++i)
176  {
177  token.str(tokens[i]);
178  token >> matrix.at(row, i);
179  token.clear();
180  }
181  }
182  }
183 }; // class IncrementPolicy
184 
185 } // namespace data
186 } // namespace mlpack
187 
188 #endif
IncrementPolicy is used as a helper class for DatasetMapper.
The core includes that mlpack expects; standard C++ includes and Armadillo.
void MapTokens(const std::vector< std::string > &tokens, size_t &row, arma::Mat< eT > &matrix, MapType &maps, std::vector< Datatype > &types)
MapTokens turns a vector of strings into numeric variables and puts them into a given matrix...
static const bool NeedsFirstPass
We do need a first pass over the data to set the dimension types right.
T MapString(const std::string &string, const size_t dimension, MapType &maps, std::vector< Datatype > &types)
Given the string and the dimension to which it belongs, and the maps and types given by the Datas...
void MapFirstPass(const std::string &string, const size_t dim, std::vector< Datatype > &types)
Determine if the dimension is numeric or categorical.