Skip to content

Commit

Permalink
[feat] add kmeans
Browse files Browse the repository at this point in the history
  • Loading branch information
omoghaoghenemano committed Sep 17, 2024
1 parent 66f6a1b commit 2d82d91
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 0 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,25 @@ include_directories(include)
add_library(InfernoML STATIC
src/algorithms/linear_regression.cpp
)
add_library(Kmeans STATIC
src/algorithms/kmeans.cpp
)

add_library(Activations STATIC
src/activation/activation_functions.cpp
)

# Define executables
add_executable(linear_regression_example examples/linear_regression_example.cpp)
add_executable(kmeans_functions_example examples/kmeans_functions_example.cpp)
add_executable(activation_functions_example examples/activation_functions_example.cpp)

# Link libraries to executables
target_link_libraries(linear_regression_example PRIVATE InfernoML)
target_link_libraries(activation_functions_example PRIVATE Activations)
target_link_libraries(kmeans_functions_example PRIVATE Kmeans)

# Ensure that include directories are added for the specific targets if necessary
target_include_directories(linear_regression_example PRIVATE ${PROJECT_SOURCE_DIR}/include)
target_include_directories(kmeans_functions_example PRIVATE ${PROJECT_SOURCE_DIR}/include)
target_include_directories(activation_functions_example PRIVATE ${PROJECT_SOURCE_DIR}/include)
34 changes: 34 additions & 0 deletions examples/kmeans_functions_example.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#include <iostream>
#include <vector>
#include "algorithms/kmeans.h"

int main() {
// Define data points (each point is 1D in this example)
std::vector<std::vector<double>> data = {
{0.5}, {-0.3}, {0.8}, {-1.2}, {0.0}, {2.5}, {3.0}, {2.8}, {-2.0}, {1.5}
};

// Instantiate the KMeans class with 2 clusters
algorithms::KMeans kmeans(2);

// Fit the KMeans algorithm to the data
kmeans.fit(data);

// Retrieve cluster labels and centroids
std::vector<int> labels = kmeans.getLabels();
std::vector<std::vector<double>> centroids = kmeans.getCentroids();

// Print cluster labels for each data point
std::cout << "KMeans Labels:" << std::endl;
for (size_t i = 0; i < labels.size(); i++) {
std::cout << "Data Point " << i << " is in Cluster " << labels[i] << std::endl;
}

// Print the centroids of each cluster
std::cout << "Cluster Centroids:" << std::endl;
for (size_t i = 0; i < centroids.size(); i++) {
std::cout << "Centroid " << i << ": " << centroids[i][0] << std::endl;
}

return 0;
}
46 changes: 46 additions & 0 deletions include/algorithms/kmeans.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef KMEANS_H
#define KMEANS_H

#include <vector>

namespace algorithms{

class KMeans {
public:
// Constructor
KMeans(int k, int maxIterations = 100);

// Fit the model to the data
void fit(const std::vector<std::vector<double>>& data);

// Get cluster labels
const std::vector<int>& getLabels() const;

// Get cluster centers
const std::vector<std::vector<double>>& getCentroids() const;

private:
int k; // Number of clusters
int maxIterations; // Maximum number of iterations
std::vector<std::vector<double>> centroids; // Cluster centroids
std::vector<int> labels; // Cluster labels for each point

// Randomly initialize centroids from the data points
void initializeCentroids(const std::vector<std::vector<double>>& data, int dimensions);

// Assign each point to the nearest centroid
void assignClusters(const std::vector<std::vector<double>>& data);

// Update centroids based on the mean of the assigned points
void updateCentroids(const std::vector<std::vector<double>>& data, int dimensions);

// Calculate Euclidean distance between two points
double euclideanDistance(const std::vector<double>& p1, const std::vector<double>& p2) const;

// Check if the centroids have converged (optional, currently fixed iterations)
bool converged() const;
};

}

#endif
44 changes: 44 additions & 0 deletions include/algorithms/logistic_regression.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#ifndef LINEAR_REGRESSION_H
#define LINEAR_REGRESSION_H

#include <vector>
#include <stdexcept>

namespace algorithms {

class LinearRegression {
public:
// Constructor
LinearRegression() : m_slope(0.0), m_intercept(0.0), m_learning_rate(0.01), m_iterations(1000) {}

// Fit the model to the training data using gradient descent
void fit(const std::vector<double>& x, const std::vector<double>& y);

// Predict the output for a given input
double predict(double x) const;

// Getters for the parameters
double getSlope() const { return m_slope; }
double getIntercept() const { return m_intercept; }

// Set learning rate and number of iterations
void setLearningRate(double lr) { m_learning_rate = lr; }
void setIterations(int it) { m_iterations = it; }

private:
double m_slope;
double m_intercept;
double m_learning_rate;
int m_iterations;

// Helper function to compute the mean of a vector
double mean(const std::vector<double>& v) const;

// Helper functions for gradient descent
double computeCost(const std::vector<double>& x, const std::vector<double>& y) const;
void gradientDescent(const std::vector<double>& x, const std::vector<double>& y);
};

} // namespace algorithms

#endif // LINEAR_REGRESSION_H
116 changes: 116 additions & 0 deletions src/algorithms/kmeans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include "algorithms/kmeans.h"
#include <iostream>
#include <cmath>
#include <limits>
#include <random>


namespace algorithms{

// Constructor
KMeans::KMeans(int k, int maxIterations) : k(k), maxIterations(maxIterations) {}

// Fit the model to the data
void KMeans::fit(const std::vector<std::vector<double>>& data) {
int numPoints = data.size();
int dimensions = data[0].size();

// Randomly initialize centroids
initializeCentroids(data, dimensions);

// Run the algorithm for a fixed number of iterations
for (int iteration = 0; iteration < maxIterations; iteration++) {
// Step 1: Assign points to the closest centroid
assignClusters(data);

// Step 2: Update centroids based on the points assigned to them
updateCentroids(data, dimensions);

// Check for convergence (not implemented for simplicity, fixed iterations)
if (converged()) {
break;
}
}
}

// Get cluster labels
const std::vector<int>& KMeans::getLabels() const {
return labels;
}

// Get cluster centers
const std::vector<std::vector<double>>& KMeans::getCentroids() const {
return centroids;
}

// Randomly initialize centroids from the data points
void KMeans::initializeCentroids(const std::vector<std::vector<double>>& data, int dimensions) {
centroids.resize(k, std::vector<double>(dimensions));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0, data.size() - 1);

for (int i = 0; i < k; i++) {
centroids[i] = data[dis(gen)];
}
}

// Assign each point to the nearest centroid
void KMeans::assignClusters(const std::vector<std::vector<double>>& data) {
labels.resize(data.size());
for (size_t i = 0; i < data.size(); i++) {
double minDist = std::numeric_limits<double>::max();
int closestCentroid = 0;
for (int j = 0; j < k; j++) {
double dist = euclideanDistance(data[i], centroids[j]);
if (dist < minDist) {
minDist = dist;
closestCentroid = j;
}
}
labels[i] = closestCentroid;
}
}

// Update centroids based on the mean of the assigned points
void KMeans::updateCentroids(const std::vector<std::vector<double>>& data, int dimensions) {
std::vector<std::vector<double>> newCentroids(k, std::vector<double>(dimensions, 0.0));
std::vector<int> pointsPerCentroid(k, 0);

// Sum the points assigned to each centroid
for (size_t i = 0; i < data.size(); i++) {
int centroidIndex = labels[i];
pointsPerCentroid[centroidIndex]++;
for (int d = 0; d < dimensions; d++) {
newCentroids[centroidIndex][d] += data[i][d];
}
}

// Update the centroids by computing the average
for (int j = 0; j < k; j++) {
if (pointsPerCentroid[j] > 0) {
for (int d = 0; d < dimensions; d++) {
newCentroids[j][d] /= pointsPerCentroid[j];
}
}
}

centroids = newCentroids;
}

// Calculate Euclidean distance between two points
double KMeans::euclideanDistance(const std::vector<double>& p1, const std::vector<double>& p2) const {
double sum = 0.0;
for (size_t i = 0; i < p1.size(); i++) {
sum += std::pow(p1[i] - p2[i], 2);
}
return std::sqrt(sum);
}

// Check if the centroids have converged (optional, currently fixed iterations)
bool KMeans::converged() const {
// This can be implemented with a threshold to check if centroids have stopped moving
return false;
}

}

0 comments on commit 2d82d91

Please sign in to comment.