15 #ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
16 #define MLPACK_CORE_DATA_DATASET_INFO_HPP
19 #include <unordered_map>
20 #include <boost/bimap.hpp>
35 template <
typename PolicyType>
/**
 * Construct a DatasetMapper from a mapping policy and an optional
 * dimensionality.
 *
 * NOTE(review): whether the policy is copied or referenced is not visible in
 * this declaration -- confirm against the implementation.
 *
 * @param policy Mapping policy the mapper should use.
 * @param dimensionality Number of dimensions of the dataset (defaults to 0).
 */
51 explicit DatasetMapper(PolicyType& policy,
const size_t dimensionality = 0);
/**
 * Preprocessing step used during a first pass over the data: the string is
 * passed on to the mapping policy (per the accompanying documentation, only
 * when the policy needs it).
 *
 * @param string String encountered in the data.
 * @param dimension Dimension the string belongs to.
 */
61 void MapFirstPass(
const std::string&
string,
const size_t dimension);
75 const size_t dimension);
/**
 * Return the string that corresponds to a given value in a given dimension.
 *
 * NOTE(review): behavior for a value with no mapping is not visible in this
 * declaration -- confirm against the implementation.
 *
 * @param value Mapped value to look up.
 * @param dimension Dimension in which to look up the value.
 * @return Reference to the string associated with the value.
 */
85 const std::string&
UnmapString(
const size_t value,
const size_t dimension);
/**
 * Return the value that corresponds to a given string in a given dimension.
 *
 * NOTE(review): behavior for a string with no mapping is not visible in this
 * declaration -- confirm against the implementation.
 *
 * @param string String to look up.
 * @param dimension Dimension in which to look up the string.
 * @return The mapped value, of the policy's MappedType.
 */
96 typename PolicyType::MappedType
UnmapValue(
const std::string&
string,
97 const size_t dimension);
/**
 * Turn a vector of string tokens into numeric values and store them in the
 * given matrix.
 *
 * NOTE(review): `row` is taken by non-const reference, so it may be updated
 * by the call; how the tokens are laid out in the matrix is not visible
 * here -- confirm against the implementation.
 *
 * @tparam eT Element type of the output matrix.
 * @param tokens Strings to convert.
 * @param row Row index associated with the tokens (in/out reference).
 * @param matrix Matrix that receives the numeric values.
 */
111 template <
typename eT>
112 void MapTokens(
const std::vector<std::string>& tokens,
size_t& row,
113 arma::Mat<eT>& matrix);
137 template<
typename Archive>
//! Return the policy of the mapper (read-only access).
145 const PolicyType&
Policy()
const;
//! Set the policy of the mapper from an rvalue policy object.
//! NOTE(review): presumably moves from the argument -- confirm against the
//! implementation.
150 void Policy(PolicyType&& policy);
//! Datatypes held by the mapper.
//! NOTE(review): presumably one entry per dimension, indexed by dimension --
//! confirm against the implementation.
154 std::vector<Datatype> types;
//! Bidirectional map between strings and the policy's mapped values.
157 using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>;
//! Hash map from a size_t key to a (bidirectional map, size_t) pair.
//! NOTE(review): the key is presumably the dimension index and the second
//! pair element presumably a per-dimension mapping count -- confirm against
//! the implementation.
162 using MapType = std::unordered_map<size_t, std::pair<BiMapType, size_t>>;
178 #include "dataset_mapper_impl.hpp"
Auxiliary information for a dataset, including mappings to/from strings and the datatype of each dimension.
DatasetMapper(const size_t dimensionality=0)
Create the DatasetMapper object with the given dimensionality.
T MapString(const std::string &string, const size_t dimension)
Given the string and the dimension to which it belongs, return its numeric mapping.
FirstShim< T > CreateNVP(T &t, const std::string &name, typename boost::enable_if< HasSerialize< T >>::type *=0)
Call this function to produce a name-value pair; this is similar to BOOST_SERIALIZATION_NVP(), but should be used for types that have a Serialize() function (or contain a type that has a Serialize() function) instead of a serialize() function.
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
void MapFirstPass(const std::string &string, const size_t dimension)
Preprocessing: during a first pass of the data, pass the strings on to the MapPolicy if they are needed.
The core includes that mlpack expects; standard C++ includes and Armadillo.
size_t NumMappings(const size_t dimension) const
Get the number of mappings for a particular dimension.
const std::string & UnmapString(const size_t value, const size_t dimension)
Return the string that corresponds to a given value in a given dimension.
PolicyType::MappedType UnmapValue(const std::string &string, const size_t dimension)
Return the value that corresponds to a given string in a given dimension.
const PolicyType & Policy() const
Return the policy of the mapper.
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information for).
Datatype
The Datatype enum specifies the types of data mlpack algorithms can use.
void Serialize(Archive &ar, const unsigned int)
Serialize the dataset information.
void MapTokens(const std::vector< std::string > &tokens, size_t &row, arma::Mat< eT > &matrix)
MapTokens turns a vector of strings into numeric variables and puts them into a given matrix.