59 using namespace UTILSLIB;
60 using namespace Eigen;
72 : m_sDistance(distance.toStdString())
73 , m_sStart(start.toStdString())
75 , m_sEmptyact(emptyact.toStdString())
131 srand ( time(NULL) );
138 if(m_sDistance.compare(
"cosine") == 0)
148 else if(m_sDistance.compare(
"correlation")==0)
150 X.array() -= (X.rowwise().sum().array() / (double)p).replicate(1,p);
151 MatrixXd Xnorm = (X.array().pow(2).rowwise().sum()).sqrt();
157 X.array() /= Xnorm.replicate(1,p).array();
169 if (m_sStart.compare(
"uniform") == 0)
171 if (m_sDistance.compare(
"hamming") == 0)
173 printf(
"Error: Uniform Start For Hamming\n");
176 Xmins = X.colwise().minCoeff();
177 Xmaxs = X.colwise().maxCoeff();
186 Del.fill(std::numeric_limits<double>::quiet_NaN());
189 double totsumDBest = std::numeric_limits<double>::max();
197 for(qint32 rep = 0; rep < m_iReps; ++rep)
199 if (m_sStart.compare(
"uniform") == 0)
201 C = MatrixXd::Zero(k,p);
202 for(qint32 i = 0; i < k; ++i)
203 for(qint32 j = 0; j < p; ++j)
204 C(i,j) = unifrnd(Xmins[j], Xmaxs[j]);
209 if (m_sDistance.compare(
"correlation") == 0)
210 C.array() -= (C.array().rowwise().sum()/p).replicate(1, p).array();
212 else if (m_sStart.compare(
"sample") == 0)
214 C = MatrixXd::Zero(k,p);
215 for(qint32 i = 0; i < k; ++i)
216 C.block(i,0,1,p) = X.block(rand() % n, 0, 1, p);
235 idx = VectorXi::Zero(D.rows());
236 d = VectorXd::Zero(D.rows());
238 for(qint32 i = 0; i < D.rows(); ++i)
239 d[i] = D.row(i).minCoeff(&idx[i]);
241 m = VectorXi::Zero(k);
242 for(qint32 i = 0; i < k; ++i)
243 for (qint32 j = 0; j < idx.rows(); ++j)
250 bool converged = batchUpdate(X, C, idx);
254 converged = onlineUpdate(X, C, idx);
257 printf(
"Failed To Converge during replicate %d\n", rep);
260 VectorXi nonempties = VectorXi::Zero(m.rows());
262 for(qint32 i = 0; i < m.rows(); ++i)
270 MatrixXd C_tmp(count,C.cols());
272 for(qint32 i = 0; i < nonempties.rows(); ++i)
276 C_tmp.row(count) = C.row(i);
281 MatrixXd D_tmp = distfun(X, C_tmp);
283 for(qint32 i = 0; i < nonempties.rows(); ++i)
287 D.col(i) = D_tmp.col(count);
288 C.row(i) = C_tmp.row(count);
293 d = VectorXd::Zero(n);
294 for(qint32 i = 0; i < n; ++i)
295 d[i] += D.array()(idx[i]*n+i);
297 sumD = VectorXd::Zero(
k);
298 for(qint32 i = 0; i <
k; ++i)
299 for (qint32 j = 0; j < idx.rows(); ++j)
303 totsumD = sumD.array().sum();
308 if (totsumD < totsumDBest)
310 totsumDBest = totsumD;
328 emptyErrCnt = emptyErrCnt + 1;
330 if (emptyErrCnt == m_iReps)
356 bool KMeans::batchUpdate(
const MatrixXd& X, MatrixXd& C, VectorXi& idx)
361 for(i = 0; i < n; ++i)
365 for(i = 0; i < k; ++i)
368 previdx = VectorXi::Zero(n);
370 prevtotsumD = std::numeric_limits<double>::max();
372 MatrixXd D = MatrixXd::Zero(X.rows(), k);
378 bool converged =
false;
387 KMeans::gcentroids(X, idx, changed, C_new, m_new);
388 MatrixXd D_new = distfun(X, C_new);
390 for(qint32 i = 0; i < changed.rows(); ++i)
392 C.row(changed[i]) = C_new.row(i);
393 D.col(changed[i]) = D_new.col(i);
394 m[changed[i]] = m_new[i];
398 VectorXi empties = VectorXi::Zero(changed.rows());
399 for(qint32 i = 0; i < changed.rows(); ++i)
403 if (empties.sum() > 0)
405 if (m_sEmptyact.compare(
"error") == 0)
410 else if (m_sEmptyact.compare(
"drop") == 0)
417 else if (m_sEmptyact.compare(
"singleton") == 0)
450 for(qint32 i = 0; i < n; ++i)
451 totsumD += D.array()(idx[i]*n+i);
454 if(prevtotsumD <= totsumD)
459 gcentroids(X, idx, changed, C_new, m_new);
460 C.block(0,0,k,C.cols()) = C_new;
461 m.block(0,0,k,1) = m_new;
467 if (iter >= m_iMaxit)
472 prevtotsumD = totsumD;
474 VectorXi nidx(D.rows());
475 for(qint32 i = 0; i < D.rows(); ++i)
476 d[i] = D.row(i).minCoeff(&nidx[i]);
479 VectorXi moved = VectorXi::Zero(nidx.rows());
481 for(qint32 i = 0; i < nidx.rows(); ++i)
483 if(nidx[i] != previdx[i])
489 moved.conservativeResize(count);
491 if (moved.rows() > 0)
494 VectorXi moved_new = VectorXi::Zero(moved.rows());
496 for(qint32 i = 0; i < moved.rows(); ++i)
498 if(D.array()(previdx[moved[i]] * n + moved[i]) > d[moved[i]])
500 moved_new[count] = moved[i];
504 moved_new.conservativeResize(count);
508 if (moved.rows() == 0)
514 for(qint32 i = 0; i < moved.rows(); ++i)
516 idx[ moved[i] ] = nidx[ moved[i] ];
519 std::vector<int> tmp;
520 for(qint32 i = 0; i < moved.rows(); ++i)
521 tmp.push_back(idx[moved[i]]);
522 for(qint32 i = 0; i < moved.rows(); ++i)
523 tmp.push_back(previdx[moved[i]]);
525 std::sort(tmp.begin(),tmp.end());
527 std::vector<int>::iterator it;
528 it = std::unique(tmp.begin(),tmp.end());
529 tmp.resize( it - tmp.begin() );
531 changed.conservativeResize(tmp.size());
533 for(quint32 i = 0; i < tmp.size(); ++i)
541 bool KMeans::onlineUpdate(
const MatrixXd& X, MatrixXd& C, VectorXi& idx)
546 if (m_sDistance.compare(
"cityblock") == 0)
548 Xmid1 = MatrixXd::Zero(k,p);
549 Xmid2 = MatrixXd::Zero(k,p);
550 for(qint32 i = 0; i <
k; ++i)
556 MatrixXd Xsorted(m[i],p);
558 for(qint32 j = 0; j < idx.rows(); ++j)
562 Xsorted.row(c) = X.row(j);
566 for(qint32 j = 0; j < Xsorted.cols(); ++j)
567 std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());
569 qint32 nn = floor(0.5*m[i])-1;
572 Xmid1.row(i) = Xsorted.row(nn);
573 Xmid2.row(i) = Xsorted.row(nn+1);
577 Xmid1.row(i) = Xsorted.row(nn);
578 Xmid2.row(i) = Xsorted.row(nn+2);
582 Xmid1.row(i) = Xsorted.row(0);
583 Xmid2.row(i) = Xsorted.row(0);
588 else if (m_sDistance.compare(
"hamming") == 0)
602 VectorXi changed = VectorXi(m.rows());
604 for(qint32 i = 0; i < m.rows(); ++i)
612 changed.conservativeResize(count);
614 qint32 lastmoved = 0;
617 bool converged =
false;
618 while (iter < m_iMaxit)
631 if (m_sDistance.compare(
"sqeuclidean") == 0)
633 for(qint32 j = 0; j < changed.rows(); ++j)
635 qint32 i = changed[j];
636 VectorXi mbrs = VectorXi::Zero(idx.rows());
637 for(qint32 l = 0; l < idx.rows(); ++l)
641 VectorXi sgn = 1 - 2 * mbrs.array();
644 for(qint32 l = 0; l < mbrs.rows(); ++l)
648 Del.col(i) = ((double)m[i] / ((
double)m[i] + sgn.cast<
double>().array()));
650 Del.col(i).array() *= (X - C.row(i).replicate(n,1)).array().pow(2).rowwise().sum().array();
653 else if (m_sDistance.compare(
"cityblock") == 0)
655 for(qint32 j = 0; j < changed.rows(); ++j)
657 qint32 i = changed[j];
660 MatrixXd ldist = Xmid1.row(i).replicate(n,1) - X;
661 MatrixXd rdist = X - Xmid2.row(i).replicate(n,1);
662 VectorXd mbrs = VectorXd::Zero(idx.rows());
664 for(qint32 l = 0; l < idx.rows(); ++l)
667 MatrixXd sgn = ((-2*mbrs).array() + 1).replicate(1, p);
668 rdist = sgn.array()*rdist.array(); ldist = sgn.array()*ldist.array();
670 for(qint32 l = 0; l < idx.rows(); ++l)
673 for(qint32 h = 0; h < rdist.cols(); ++h)
674 sum += rdist(l,h) > ldist(l,h) ? rdist(l,h) < 0 ? 0 : rdist(l,h) : ldist(l,h) < 0 ? 0 : ldist(l,h);
679 Del.col(i) = ((X - C.row(i).replicate(n,1)).array().abs()).rowwise().sum();
682 else if (m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
685 MatrixXd normC = C.array().pow(2).rowwise().sum().sqrt();
692 for(qint32 j = 0; j < changed.rows(); ++j)
695 XCi = X * C.row(i).transpose();
697 VectorXi mbrs = VectorXi::Zero(idx.rows());
698 for(qint32 l = 0; l < idx.rows(); ++l)
702 VectorXi sgn = 1 - 2 * mbrs.array();
704 double A = (double)m[i] * normC(i,0);
705 double B = pow(((
double)m[i] * normC(i,0)),2);
707 Del.col(i) = 1 + sgn.cast<
double>().array()*
708 (A - (B + 2 * sgn.cast<
double>().array() * m[i] * XCi.array() + 1).sqrt());
710 std::cout <<
"Del.col(i)\n" << Del.col(i) << std::endl;
716 else if (m_sDistance.compare(
"hamming") == 0)
737 prevtotsumD = totsumD;
739 VectorXi nidx = VectorXi::Zero(Del.rows());
740 VectorXd minDel = VectorXd::Zero(Del.rows());
742 for(qint32 i = 0; i < Del.rows(); ++i)
743 minDel[i] = Del.row(i).minCoeff(&nidx[i]);
745 VectorXi moved = VectorXi::Zero(previdx.rows());
747 for(qint32 i = 0; i < moved.rows(); ++i)
749 if(previdx[i] != nidx[i])
755 moved.conservativeResize(count);
760 VectorXi moved_new = VectorXi::Zero(moved.rows());
762 for(qint32 i = 0; i < moved.rows(); ++i)
764 if ( Del.array()(previdx[moved(i)]*n + moved(i)) > minDel(moved(i)))
766 moved_new[count] = moved[i];
770 moved_new.conservativeResize(count);
774 if (moved.rows() <= 0)
778 if ((iter == iter1) || nummoved > 0)
789 VectorXi moved_new(moved.rows());
790 for(qint32 i = 0; i < moved.rows(); ++i)
791 moved_new[i] = ((moved[i] - lastmoved) % n) + lastmoved;
793 moved[0] = moved_new.minCoeff() % n;
794 moved.conservativeResize(1);
797 if (moved[0] <= lastmoved)
806 lastmoved = moved[0];
808 qint32 oidx = idx(moved[0]);
809 nidx[0] = nidx(moved[0]);
810 nidx.conservativeResize(1);
811 totsumD += Del(moved[0],nidx[0]) - Del(moved[0],oidx);
815 idx[ moved[0] ] = nidx[0];
816 m( nidx[0] ) = m( nidx[0] ) + 1;
817 m( oidx ) = m( oidx ) - 1;
819 if (m_sDistance.compare(
"sqeuclidean") == 0)
821 C.row(nidx[0]) = C.row(nidx[0]).array() + (X.row(moved[0]) - C.row(nidx[0])).array() / m[nidx[0]];
822 C.row(oidx) = C.row(oidx).array() - (X.row(moved[0]) - C.row(oidx)).array() / m[oidx];
824 else if (m_sDistance.compare(
"cityblock") == 0)
827 onidx << oidx, nidx[0];
830 for(qint32 h = 0; h < 2; ++h)
836 MatrixXd Xsorted(m[i],p);
838 for(qint32 j = 0; j < idx.rows(); ++j)
842 Xsorted.row(c) = X.row(j);
846 for(qint32 j = 0; j < Xsorted.cols(); ++j)
847 std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());
849 qint32 nn = floor(0.5*m[i])-1;
852 C.row(i) = 0.5 * (Xsorted.row(nn) + Xsorted.row(nn+1));
853 Xmid1.row(i) = Xsorted.row(nn);
854 Xmid2.row(i) = Xsorted.row(nn+1);
858 C.row(i) = Xsorted.row(nn+1);
861 Xmid1.row(i) = Xsorted.row(nn);
862 Xmid2.row(i) = Xsorted.row(nn+2);
866 Xmid1.row(i) = Xsorted.row(0);
867 Xmid2.row(i) = Xsorted.row(0);
872 else if (m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
874 C.row(nidx[0]).array() += (X.row(moved[0]) - C.row(nidx[0])).array() / m[nidx[0]];
875 C.row(oidx).array() += (X.row(moved[0]) - C.row(oidx)).array() / m[oidx];
877 else if (m_sDistance.compare(
"hamming") == 0)
887 VectorXi sorted_onidx(1+nidx.rows());
888 sorted_onidx << oidx, nidx;
889 std::sort(sorted_onidx.data(), sorted_onidx.data()+sorted_onidx.rows());
890 changed = sorted_onidx;
898 MatrixXd KMeans::distfun(
const MatrixXd& X, MatrixXd& C)
900 MatrixXd D = MatrixXd::Zero(n,C.rows());
901 qint32 nclusts = C.rows();
903 if (m_sDistance.compare(
"sqeuclidean") == 0)
905 for(qint32 i = 0; i < nclusts; ++i)
907 D.col(i) = (X.col(0).array() - C(i,0)).pow(2);
909 for(qint32 j = 1; j < p; ++j)
910 D.col(i) = D.col(i).array() + (X.col(j).array() - C(i,j)).pow(2);
913 else if (m_sDistance.compare(
"cityblock") == 0)
915 for(qint32 i = 0; i < nclusts; ++i)
917 D.col(i) = (X.col(0).array() - C(i,0)).array().abs();
918 for(qint32 j = 1; j < p; ++j)
920 D.col(i).array() += (X.col(j).array() - C(i,j)).array().abs();
924 else if (m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
927 MatrixXd normC = C.array().pow(2).rowwise().sum().sqrt();
930 for (qint32 i = 0; i < nclusts; ++i)
932 MatrixXd C_tmp = (C.row(i).array() / normC(i,0)).transpose();
933 D.col(i) = X * C_tmp;
934 for(qint32 j = 0; j < D.rows(); ++j)
954 void KMeans::gcentroids(
const MatrixXd& X,
const VectorXi& index,
const VectorXi& clusts,
955 MatrixXd& centroids, VectorXi& counts)
957 qint32 num = clusts.rows();
958 centroids = MatrixXd::Zero(num,p);
959 centroids.fill(std::numeric_limits<double>::quiet_NaN());
960 counts = VectorXi::Zero(num);
966 for(qint32 i = 0; i < num; ++i)
969 members = VectorXi::Zero(index.rows());
970 for(qint32 j = 0; j < index.rows(); ++j)
972 if(index[j] == clusts[i])
978 members.conservativeResize(c);
982 if(m_sDistance.compare(
"sqeuclidean") == 0)
985 if(members.rows() > 0)
986 centroids.row(i) = RowVectorXd::Zero(centroids.cols());
988 for(qint32 j = 0; j < members.rows(); ++j)
989 centroids.row(i).array() += X.row(members[j]).array() / counts[i];
991 else if(m_sDistance.compare(
"cityblock") == 0)
995 MatrixXd Xsorted(counts[i],p);
998 for(qint32 j = 0; j < index.rows(); ++j)
1000 if(index[j] == clusts[i])
1002 Xsorted.row(c) = X.row(j);
1007 for(qint32 j = 0; j < Xsorted.cols(); ++j)
1008 std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());
1010 qint32 nn = floor(0.5*(counts(i)))-1;
1011 if (counts[i] % 2 == 0)
1012 centroids.row(i) = .5 * (Xsorted.row(nn) + Xsorted.row(nn+1));
1014 centroids.row(i) = Xsorted.row(nn+1);
1016 else if(m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
1018 for(qint32 j = 0; j < members.rows(); ++j)
1019 centroids.row(i).array() += X.row(members[j]).array() / counts[i];
1032 double KMeans::unifrnd(
double a,
double b)
1035 return std::numeric_limits<double>::quiet_NaN();
1042 double r = mu + sig * (2.0* (rand() % 1000)/1000 -1.0);