mlpack  2.2.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dataset_mapper.hpp
Go to the documentation of this file.
1 
15 #ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
16 #define MLPACK_CORE_DATA_DATASET_INFO_HPP
17 
18 #include <mlpack/prereqs.hpp>
19 #include <unordered_map>
20 #include <boost/bimap.hpp>
21 
23 
24 namespace mlpack {
25 namespace data {
35 template <typename PolicyType>
37 {
38  public:
44  explicit DatasetMapper(const size_t dimensionality = 0);
45 
51  explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
52 
60  template<typename T>
61  void MapFirstPass(const std::string& string, const size_t dimension);
62 
73  template<typename T>
74  T MapString(const std::string& string,
75  const size_t dimension);
76 
85  const std::string& UnmapString(const size_t value, const size_t dimension);
86 
87 
96  typename PolicyType::MappedType UnmapValue(const std::string& string,
97  const size_t dimension);
98 
111  template <typename eT>
112  void MapTokens(const std::vector<std::string>& tokens, size_t& row,
113  arma::Mat<eT>& matrix);
114 
116  Datatype Type(const size_t dimension) const;
118  Datatype& Type(const size_t dimension);
119 
124  size_t NumMappings(const size_t dimension) const;
125 
132  size_t Dimensionality() const;
133 
137  template<typename Archive>
138  void Serialize(Archive& ar, const unsigned int /* version */)
139  {
140  ar & data::CreateNVP(types, "types");
141  ar & data::CreateNVP(maps, "maps");
142  }
143 
145  const PolicyType& Policy() const;
146 
148  PolicyType& Policy();
150  void Policy(PolicyType&& policy);
151 
152  private:
154  std::vector<Datatype> types;
155 
156  // BiMapType definition
157  using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>;
158 
159  // Mappings from strings to integers.
160  // Map entries will only exist for dimensions that are categorical.
161  // MapType = map<dimension, pair<bimap<string, MappedType>, numMappings>>
162  using MapType = std::unordered_map<size_t, std::pair<BiMapType, size_t>>;
163 
165  MapType maps;
166 
168  // mapped to the maps object. It is used in MapString() and MapTokens().
169  PolicyType policy;
170 };
171 
172 // Use typedef to provide backward compatibility (NOTE: the declaration itself, listing line 173, is missing from this extract).
174 
175 } // namespace data
176 } // namespace mlpack
177 
178 #include "dataset_mapper_impl.hpp"
179 
180 #endif
Auxiliary information for a dataset, including mappings to/from strings and the datatype of each dimension.
DatasetMapper(const size_t dimensionality=0)
Create the DatasetMapper object with the given dimensionality.
T MapString(const std::string &string, const size_t dimension)
Given the string and the dimension to which it belongs, return its numeric mapping.
FirstShim< T > CreateNVP(T &t, const std::string &name, typename boost::enable_if< HasSerialize< T >>::type *=0)
Call this function to produce a name-value pair; this is similar to BOOST_SERIALIZATION_NVP(), but should be used for types that have a Serialize() function (or contain a type that has a Serialize() function) instead of a serialize() function.
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
void MapFirstPass(const std::string &string, const size_t dimension)
Preprocessing: during a first pass of the data, pass the strings on to the MapPolicy if they are needed.
The core includes that mlpack expects; standard C++ includes and Armadillo.
size_t NumMappings(const size_t dimension) const
Get the number of mappings for a particular dimension.
const std::string & UnmapString(const size_t value, const size_t dimension)
Return the string that corresponds to a given value in a given dimension.
PolicyType::MappedType UnmapValue(const std::string &string, const size_t dimension)
Return the value that corresponds to a given string in a given dimension.
const PolicyType & Policy() const
Return the policy of the mapper.
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information for).
Datatype
The Datatype enum specifies the types of data mlpack algorithms can use.
Definition: datatype.hpp:24
void Serialize(Archive &ar, const unsigned int)
Serialize the dataset information.
void MapTokens(const std::vector< std::string > &tokens, size_t &row, arma::Mat< eT > &matrix)
MapTokens turns vector of strings into numeric variables and puts them into a given matrix.