-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataPoint.cpp
More file actions
106 lines (93 loc) · 2.86 KB
/
DataPoint.cpp
File metadata and controls
106 lines (93 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
//
// Created by dhruva on 4/9/25.
//
#include "DataPoint.h"
#include <iostream>
#include <cmath>
#include <ostream>
#include <valarray>
#include <sstream>
#include <map>
std::string DataPoint::toString() const
{
std::ostringstream oss;
oss << "DataPoint(label: \"" << label << "\", features: [";
for (size_t i = 0; i < features.size(); ++i)
{
oss << features[i];
if (i != features.size() - 1)
{
oss << ", ";
}
}
oss << "])";
return oss.str();
}
// Calculates Euclidean distance between DataPoint a and Datapoint B
// throws an error if the number of dimensions of the features do not match
double DataPoint::calculateEuclideanDistance(const DataPoint& a, const DataPoint& b)
{
const auto vector_a = a.features;
const auto vector_b = b.features;
// check dimensions
if (vector_a.size() != vector_b.size())
{
std::cerr << "Vectors have different sizes" << std::endl;
throw std::invalid_argument("Vectors have different sizes");
}
double sum_of_diff_square = 0;
for (size_t i = 0; i < vector_a.size(); i++)
{
auto diff = vector_a[i] - vector_b[i];
sum_of_diff_square += diff * diff;
}
return sqrt(sum_of_diff_square);
}
// Find the nearest k neighbours. Calculates distance to all points.
// TODO: optimize this further
std::vector<DataPoint> DataPoint::findKClosestPoints(const DataPoint& datapoint,
const std::vector<DataPoint>& dataPoints, int k)
{
using Neighbour = std::pair<DataPoint, double>;
std::vector<Neighbour> neighbours;
neighbours.reserve(dataPoints.size());
for (const DataPoint& point : dataPoints)
{
double distance = calculateEuclideanDistance(datapoint, point);
neighbours.emplace_back(point, distance);
}
// sort by distance
std::ranges::sort(neighbours, [](const Neighbour& a, const Neighbour& b) { return a.second < b.second; });
// pick only the first k entries and return them
neighbours.resize(k);
std::vector<DataPoint> closestPoints;
for (size_t i = 0; i < k; i++)
{
closestPoints.push_back(neighbours[i].first);
}
return closestPoints;
}
std::string DataPoint::findMajorityLabel(const std::vector<DataPoint>& neighbours)
{
if (neighbours.empty())
{
throw std::invalid_argument("No neighbours found");
}
std::map<std::string, int> labelCounts;
for (const DataPoint& point : neighbours)
{
labelCounts[point.label]++;
}
// find the label with the highest count
std::string majorityLabel;
int majorityLabelCount = 0;
for (const auto& pair : labelCounts)
{
if (pair.second > majorityLabelCount)
{
majorityLabelCount = pair.second;
majorityLabel = pair.first;
}
}
return majorityLabel;
}