1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011, 2012, 2015, 2019 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_matrix.h>
20 #include <gsl/gsl_permutation.h>
21 #include <gsl/gsl_sort_vector.h>
22 #include <gsl/gsl_statistics.h>
26 #include "data/case.h"
27 #include "data/casegrouper.h"
28 #include "data/casereader.h"
29 #include "data/casewriter.h"
30 #include "data/dataset.h"
31 #include "data/dictionary.h"
32 #include "data/format.h"
33 #include "data/missing-values.h"
34 #include "language/command.h"
35 #include "language/lexer/lexer.h"
36 #include "language/lexer/variable-parser.h"
37 #include "libpspp/message.h"
38 #include "libpspp/misc.h"
39 #include "libpspp/assertion.h"
40 #include "libpspp/str.h"
41 #include "math/random.h"
42 #include "output/pivot-table.h"
43 #include "output/output-item.h"
46 #define _(msgid) gettext (msgid)
47 #define N_(msgid) msgid
56 struct save_trans_data
58 /* A writer which contains the values (if any) to be appended to
59 each case in the active dataset */
60 struct casewriter *writer;
62 /* A reader created from the writer above. */
63 struct casereader *appending_reader;
65 /* The indices to be used to access values in the above,
67 int CASE_IDX_MEMBERSHIP;
68 int CASE_IDX_DISTANCE;
70 /* The variables created to hold the values appended to the dataset */
71 struct variable *membership;
72 struct variable *distance;
76 #define SAVE_MEMBERSHIP 0x1
77 #define SAVE_DISTANCE 0x2
81 struct dataset *dataset;
82 struct dictionary *dict;
84 const struct variable **vars;
87 double epsilon; /* The convergence criterion */
89 int ngroups; /* Number of group. (Given by the user) */
90 int maxiter; /* Maximum iterations (Given by the user) */
91 bool print_cluster_membership; /* true => print membership */
92 bool print_initial_clusters; /* true => print initial cluster */
93 bool no_initial; /* true => simplified initial cluster selection */
94 bool no_update; /* true => do not iterate */
96 const struct variable *wv; /* Weighting variable. */
98 enum missing_type missing_type;
99 enum mv_class exclude;
101 /* Which values are to be saved? */
104 /* The name of the new variable to contain the cluster of each case. */
105 char *var_membership;
107 /* The name of the new variable to contain the distance of each case
108 from its cluster centre. */
111 struct save_trans_data *save_trans_data;
114 /* Holds all of the information for the functions. int n, holds the number of
115 observation and its default value is -1. We set it in
116 kmeans_recalculate_centers in first invocation. */
119 gsl_matrix *centers; /* Centers for groups. */
120 gsl_matrix *updated_centers;
123 gsl_vector_long *num_elements_groups;
125 gsl_matrix *initial_centers; /* Initial random centers. */
126 double convergence_criteria;
127 gsl_permutation *group_order; /* Group order for reporting. */
130 static struct Kmeans *kmeans_create (const struct qc *qc);
132 static void kmeans_get_nearest_group (const struct Kmeans *kmeans,
133 struct ccase *c, const struct qc *,
134 int *, double *, int *, double *);
136 static void kmeans_order_groups (struct Kmeans *kmeans, const struct qc *);
138 static void kmeans_cluster (struct Kmeans *kmeans, struct casereader *reader,
141 static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial,
144 static void quick_cluster_show_membership (struct Kmeans *kmeans,
145 const struct casereader *reader,
148 static void quick_cluster_show_number_cases (struct Kmeans *kmeans,
151 static void quick_cluster_show_results (struct Kmeans *kmeans,
152 const struct casereader *reader,
155 int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds);
157 static void kmeans_destroy (struct Kmeans *kmeans);
159 /* Creates and returns a struct of Kmeans with given casereader 'cs', parsed
160 variables 'variables', number of cases 'n', number of variables 'm', number
161 of clusters and amount of maximum iterations. */
162 static struct Kmeans *
163 kmeans_create (const struct qc *qc)
165 struct Kmeans *kmeans = xmalloc (sizeof (struct Kmeans));
166 kmeans->centers = gsl_matrix_alloc (qc->ngroups, qc->n_vars);
167 kmeans->updated_centers = gsl_matrix_alloc (qc->ngroups, qc->n_vars);
168 kmeans->num_elements_groups = gsl_vector_long_alloc (qc->ngroups);
169 kmeans->group_order = gsl_permutation_alloc (kmeans->centers->size1);
170 kmeans->initial_centers = NULL;
176 kmeans_destroy (struct Kmeans *kmeans)
178 gsl_matrix_free (kmeans->centers);
179 gsl_matrix_free (kmeans->updated_centers);
180 gsl_matrix_free (kmeans->initial_centers);
182 gsl_vector_long_free (kmeans->num_elements_groups);
184 gsl_permutation_free (kmeans->group_order);
190 diff_matrix (const gsl_matrix *m1, const gsl_matrix *m2)
193 double max_diff = -INFINITY;
194 for (i = 0; i < m1->size1; ++i)
197 for (j = 0; j < m1->size2; ++j)
199 diff += pow2 (gsl_matrix_get (m1,i,j) - gsl_matrix_get (m2,i,j));
211 matrix_mindist (const gsl_matrix *m, int *mn, int *mm)
214 double mindist = INFINITY;
215 for (i = 0; i < m->size1 - 1; ++i)
217 for (j = i + 1; j < m->size1; ++j)
221 for (k = 0; k < m->size2; ++k)
223 diff_sq += pow2 (gsl_matrix_get (m, j, k) - gsl_matrix_get (m, i, k));
225 if (diff_sq < mindist)
240 /* Return the distance of C from the group whose index is WHICH */
242 dist_from_case (const struct Kmeans *kmeans, const struct ccase *c,
243 const struct qc *qc, int which)
247 for (j = 0; j < qc->n_vars; j++)
249 const union value *val = case_data (c, qc->vars[j]);
250 if (var_is_value_missing (qc->vars[j], val) & qc->exclude)
253 dist += pow2 (gsl_matrix_get (kmeans->centers, which, j) - val->f);
259 /* Return the minimum distance of the group WHICH and all other groups */
261 min_dist_from (const struct Kmeans *kmeans, const struct qc *qc, int which)
265 double mindist = INFINITY;
266 for (i = 0; i < qc->ngroups; i++)
272 for (j = 0; j < qc->n_vars; j++)
274 dist += pow2 (gsl_matrix_get (kmeans->centers, i, j)
275 - gsl_matrix_get (kmeans->centers, which, j));
289 /* Calculate the initial cluster centers. */
291 kmeans_initial_centers (struct Kmeans *kmeans,
292 const struct casereader *reader,
298 struct casereader *cs = casereader_clone (reader);
299 for (; (c = casereader_read (cs)) != NULL; case_unref (c))
301 bool missing = false;
302 for (j = 0; j < qc->n_vars; ++j)
304 const union value *val = case_data (c, qc->vars[j]);
305 if (var_is_value_missing (qc->vars[j], val) & qc->exclude)
311 if (nc < qc->ngroups)
312 gsl_matrix_set (kmeans->centers, nc, j, val->f);
318 if (nc++ < qc->ngroups)
327 double m = matrix_mindist (kmeans->centers, &mn, &mm);
329 kmeans_get_nearest_group (kmeans, c, qc, &mq, &delta, &mp, NULL);
331 /* If the distance between C and the nearest group, is greater than the distance
332 between the two groups which are clostest to each
333 other, then one group must be replaced. */
335 /* Out of mn and mm, which is the clostest of the two groups to C ? */
336 int which = (dist_from_case (kmeans, c, qc, mn)
337 > dist_from_case (kmeans, c, qc, mm)) ? mm : mn;
339 for (j = 0; j < qc->n_vars; ++j)
341 const union value *val = case_data (c, qc->vars[j]);
342 gsl_matrix_set (kmeans->centers, which, j, val->f);
345 else if (dist_from_case (kmeans, c, qc, mp) > min_dist_from (kmeans, qc, mq))
346 /* If the distance between C and the second nearest group
347 (MP) is greater than the smallest distance between the
348 nearest group (MQ) and any other group, then replace
351 for (j = 0; j < qc->n_vars; ++j)
353 const union value *val = case_data (c, qc->vars[j]);
354 gsl_matrix_set (kmeans->centers, mq, j, val->f);
360 casereader_destroy (cs);
362 kmeans->convergence_criteria = qc->epsilon * matrix_mindist (kmeans->centers, NULL, NULL);
364 /* As it is the first iteration, the variable kmeans->initial_centers is NULL
365 and it is created once for reporting issues. */
366 kmeans->initial_centers = gsl_matrix_alloc (qc->ngroups, qc->n_vars);
367 gsl_matrix_memcpy (kmeans->initial_centers, kmeans->centers);
371 /* Return the index of the group which is nearest to the case C */
373 kmeans_get_nearest_group (const struct Kmeans *kmeans, struct ccase *c,
374 const struct qc *qc, int *g_q, double *delta_q,
375 int *g_p, double *delta_p)
380 double mindist0 = INFINITY;
381 double mindist1 = INFINITY;
382 for (i = 0; i < qc->ngroups; i++)
385 for (j = 0; j < qc->n_vars; j++)
387 const union value *val = case_data (c, qc->vars[j]);
388 if (var_is_value_missing (qc->vars[j], val) & qc->exclude)
391 dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - val->f);
402 else if (dist < mindist1)
426 kmeans_order_groups (struct Kmeans *kmeans, const struct qc *qc)
428 gsl_vector *v = gsl_vector_alloc (qc->ngroups);
429 gsl_matrix_get_col (v, kmeans->centers, 0);
430 gsl_sort_vector_index (kmeans->group_order, v);
435 Does iterations, checks convergency. */
437 kmeans_cluster (struct Kmeans *kmeans, struct casereader *reader,
442 kmeans_initial_centers (kmeans, reader, qc);
444 gsl_matrix_memcpy (kmeans->updated_centers, kmeans->centers);
447 for (int xx = 0 ; xx < qc->maxiter ; ++xx)
449 gsl_vector_long_set_all (kmeans->num_elements_groups, 0.0);
454 struct casereader *r = casereader_clone (reader);
456 for (; (c = casereader_read (r)) != NULL; case_unref (c))
460 bool missing = false;
462 for (j = 0; j < qc->n_vars; j++)
464 const union value *val = case_data (c, qc->vars[j]);
465 if (var_is_value_missing (qc->vars[j], val) & qc->exclude)
472 double mindist = INFINITY;
473 for (g = 0; g < qc->ngroups; ++g)
475 double d = dist_from_case (kmeans, c, qc, g);
484 long *n = gsl_vector_long_ptr (kmeans->num_elements_groups, group);
485 *n += qc->wv ? case_num (c, qc->wv) : 1.0;
488 for (j = 0; j < qc->n_vars; ++j)
490 const union value *val = case_data (c, qc->vars[j]);
491 if (var_is_value_missing (qc->vars[j], val) & qc->exclude)
493 double *x = gsl_matrix_ptr (kmeans->updated_centers, group, j);
494 *x += val->f * (qc->wv ? case_num (c, qc->wv) : 1.0);
498 casereader_destroy (r);
503 /* Divide the cluster sums by the number of items in each cluster */
504 for (g = 0; g < qc->ngroups; ++g)
506 for (j = 0; j < qc->n_vars; ++j)
508 long n = gsl_vector_long_get (kmeans->num_elements_groups, g);
509 double *x = gsl_matrix_ptr (kmeans->updated_centers, g, j);
510 *x /= n + 1; // Plus 1 for the initial centers
515 gsl_matrix_memcpy (kmeans->centers, kmeans->updated_centers);
520 gsl_vector_long_set_all (kmeans->num_elements_groups, 0.0);
521 gsl_matrix_set_all (kmeans->updated_centers, 0.0);
523 struct casereader *cs = casereader_clone (reader);
524 for (; (c = casereader_read (cs)) != NULL; case_unref (c))
527 kmeans_get_nearest_group (kmeans, c, qc, &group, NULL, NULL, NULL);
529 for (j = 0; j < qc->n_vars; ++j)
531 const union value *val = case_data (c, qc->vars[j]);
532 if (var_is_value_missing (qc->vars[j], val) & qc->exclude)
535 double *x = gsl_matrix_ptr (kmeans->updated_centers, group, j);
539 long *n = gsl_vector_long_ptr (kmeans->num_elements_groups, group);
540 *n += qc->wv ? case_num (c, qc->wv) : 1.0;
543 casereader_destroy (cs);
546 /* Divide the cluster sums by the number of items in each cluster */
547 for (g = 0; g < qc->ngroups; ++g)
549 for (j = 0; j < qc->n_vars; ++j)
551 long n = gsl_vector_long_get (kmeans->num_elements_groups, g);
552 double *x = gsl_matrix_ptr (kmeans->updated_centers, g, j);
557 double d = diff_matrix (kmeans->updated_centers, kmeans->centers);
558 if (d < kmeans->convergence_criteria)
567 /* Reports centers of clusters.
568 Initial parameter is optional for future use.
569 If initial is true, initial cluster centers are reported. Otherwise,
570 resulted centers are reported. */
572 quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc *qc)
574 struct pivot_table *table
575 = pivot_table_create (initial
576 ? N_("Initial Cluster Centers")
577 : N_("Final Cluster Centers"));
579 struct pivot_dimension *clusters
580 = pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Cluster"));
582 clusters->root->show_label = true;
583 for (size_t i = 0; i < qc->ngroups; i++)
584 pivot_category_create_leaf (clusters->root,
585 pivot_value_new_integer (i + 1));
587 struct pivot_dimension *variables
588 = pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Variable"));
590 for (size_t i = 0; i < qc->n_vars; i++)
591 pivot_category_create_leaf (variables->root,
592 pivot_value_new_variable (qc->vars[i]));
594 const gsl_matrix *matrix = (initial
595 ? kmeans->initial_centers
597 for (size_t i = 0; i < qc->ngroups; i++)
598 for (size_t j = 0; j < qc->n_vars; j++)
600 double x = gsl_matrix_get (matrix, kmeans->group_order->data[i], j);
601 union value v = { .f = x };
602 pivot_table_put2 (table, i, j,
603 pivot_value_new_var_value (qc->vars[j], &v));
606 pivot_table_submit (table);
610 /* A transformation function which juxtaposes the dataset with the
611 (pre-prepared) dataset containing membership and/or distance
613 static enum trns_result
614 save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED)
616 const struct save_trans_data *std = aux;
617 struct ccase *ca = casereader_read (std->appending_reader);
619 return TRNS_CONTINUE;
621 *c = case_unshare (*c);
623 if (std->CASE_IDX_MEMBERSHIP >= 0)
624 *case_num_rw (*c, std->membership) = case_num_idx (ca, std->CASE_IDX_MEMBERSHIP);
626 if (std->CASE_IDX_DISTANCE >= 0)
627 *case_num_rw (*c, std->distance) = case_num_idx (ca, std->CASE_IDX_DISTANCE);
631 return TRNS_CONTINUE;
635 /* Free the resources of the transformation. */
637 save_trans_destroy (void *aux)
639 struct save_trans_data *std = aux;
640 casereader_destroy (std->appending_reader);
646 /* Reports cluster membership for each case, and is requested
647 saves the membership and the distance of the case from the cluster
650 quick_cluster_show_membership (struct Kmeans *kmeans,
651 const struct casereader *reader,
654 struct pivot_table *table = NULL;
655 struct pivot_dimension *cases = NULL;
656 if (qc->print_cluster_membership)
658 table = pivot_table_create (N_("Cluster Membership"));
660 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Cluster"),
664 = pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Case Number"));
666 cases->root->show_label = true;
669 gsl_permutation *ip = gsl_permutation_alloc (qc->ngroups);
670 gsl_permutation_inverse (ip, kmeans->group_order);
672 struct caseproto *proto = caseproto_create ();
675 /* Prepare data which may potentially be used in a
676 transformation appending new variables to the active
678 qc->save_trans_data = xzalloc (sizeof *qc->save_trans_data);
679 qc->save_trans_data->CASE_IDX_MEMBERSHIP = -1;
680 qc->save_trans_data->CASE_IDX_DISTANCE = -1;
681 qc->save_trans_data->writer = autopaging_writer_create (proto);
684 if (qc->save_values & SAVE_MEMBERSHIP)
686 proto = caseproto_add_width (proto, 0);
687 qc->save_trans_data->CASE_IDX_MEMBERSHIP = idx++;
690 if (qc->save_values & SAVE_DISTANCE)
692 proto = caseproto_add_width (proto, 0);
693 qc->save_trans_data->CASE_IDX_DISTANCE = idx++;
697 struct casereader *cs = casereader_clone (reader);
699 for (int i = 0; (c = casereader_read (cs)) != NULL; i++, case_unref (c))
701 assert (i < kmeans->n);
703 kmeans_get_nearest_group (kmeans, c, qc, &clust, NULL, NULL, NULL);
704 int cluster = ip->data[clust];
706 if (qc->save_trans_data)
708 /* Calculate the membership and distance values. */
709 struct ccase *outc = case_create (proto);
710 if (qc->save_values & SAVE_MEMBERSHIP)
711 *case_num_rw_idx (outc, qc->save_trans_data->CASE_IDX_MEMBERSHIP) = cluster + 1;
713 if (qc->save_values & SAVE_DISTANCE)
714 *case_num_rw_idx (outc, qc->save_trans_data->CASE_IDX_DISTANCE)
715 = sqrt (dist_from_case (kmeans, c, qc, clust));
717 casewriter_write (qc->save_trans_data->writer, outc);
720 if (qc->print_cluster_membership)
722 /* Print the cluster membership to the table. */
723 int case_idx = pivot_category_create_leaf (cases->root,
724 pivot_value_new_integer (i + 1));
725 pivot_table_put2 (table, 0, case_idx,
726 pivot_value_new_integer (cluster + 1));
730 caseproto_unref (proto);
731 gsl_permutation_free (ip);
733 if (qc->print_cluster_membership)
734 pivot_table_submit (table);
735 casereader_destroy (cs);
739 /* Reports number of cases of each single cluster. */
741 quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc)
743 struct pivot_table *table
744 = pivot_table_create (N_("Number of Cases in each Cluster"));
746 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
749 struct pivot_dimension *clusters
750 = pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Clusters"));
752 struct pivot_category *group
753 = pivot_category_create_group (clusters->root, N_("Cluster"));
756 for (int i = 0; i < qc->ngroups; i++)
759 = pivot_category_create_leaf (group, pivot_value_new_integer (i + 1));
760 int count = kmeans->num_elements_groups->data [kmeans->group_order->data[i]];
761 pivot_table_put2 (table, 0, cluster_idx, pivot_value_new_integer (count));
765 int cluster_idx = pivot_category_create_leaf (clusters->root,
766 pivot_value_new_text (N_("Valid")));
767 pivot_table_put2 (table, 0, cluster_idx, pivot_value_new_integer (total));
768 pivot_table_submit (table);
773 quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader *reader,
776 kmeans_order_groups (kmeans, qc); /* what does this do? */
778 if (qc->print_initial_clusters)
779 quick_cluster_show_centers (kmeans, true, qc);
780 quick_cluster_show_centers (kmeans, false, qc);
781 quick_cluster_show_number_cases (kmeans, qc);
783 quick_cluster_show_membership (kmeans, reader, qc);
786 /* Parse the QUICK CLUSTER command and populate QC accordingly.
787 Returns false on error. */
789 quick_cluster_parse (struct lexer *lexer, struct qc *qc)
791 if (!parse_variables_const (lexer, qc->dict, &qc->vars, &qc->n_vars,
792 PV_NO_DUPLICATE | PV_NUMERIC))
794 return (CMD_FAILURE);
797 while (lex_token (lexer) != T_ENDCMD)
799 lex_match (lexer, T_SLASH);
801 if (lex_match_id (lexer, "MISSING"))
803 lex_match (lexer, T_EQUALS);
804 while (lex_token (lexer) != T_ENDCMD
805 && lex_token (lexer) != T_SLASH)
807 if (lex_match_id (lexer, "LISTWISE")
808 || lex_match_id (lexer, "DEFAULT"))
810 qc->missing_type = MISS_LISTWISE;
812 else if (lex_match_id (lexer, "PAIRWISE"))
814 qc->missing_type = MISS_PAIRWISE;
816 else if (lex_match_id (lexer, "INCLUDE"))
818 qc->exclude = MV_SYSTEM;
820 else if (lex_match_id (lexer, "EXCLUDE"))
822 qc->exclude = MV_ANY;
826 lex_error (lexer, NULL);
831 else if (lex_match_id (lexer, "PRINT"))
833 lex_match (lexer, T_EQUALS);
834 while (lex_token (lexer) != T_ENDCMD
835 && lex_token (lexer) != T_SLASH)
837 if (lex_match_id (lexer, "CLUSTER"))
838 qc->print_cluster_membership = true;
839 else if (lex_match_id (lexer, "INITIAL"))
840 qc->print_initial_clusters = true;
843 lex_error (lexer, NULL);
848 else if (lex_match_id (lexer, "SAVE"))
850 lex_match (lexer, T_EQUALS);
851 while (lex_token (lexer) != T_ENDCMD
852 && lex_token (lexer) != T_SLASH)
854 if (lex_match_id (lexer, "CLUSTER"))
856 qc->save_values |= SAVE_MEMBERSHIP;
857 if (lex_match (lexer, T_LPAREN))
859 if (!lex_force_id (lexer))
862 free (qc->var_membership);
863 qc->var_membership = xstrdup (lex_tokcstr (lexer));
864 if (NULL != dict_lookup_var (qc->dict, qc->var_membership))
867 _("A variable called `%s' already exists."),
869 free (qc->var_membership);
870 qc->var_membership = NULL;
876 if (!lex_force_match (lexer, T_RPAREN))
880 else if (lex_match_id (lexer, "DISTANCE"))
882 qc->save_values |= SAVE_DISTANCE;
883 if (lex_match (lexer, T_LPAREN))
885 if (!lex_force_id (lexer))
888 free (qc->var_distance);
889 qc->var_distance = xstrdup (lex_tokcstr (lexer));
890 if (NULL != dict_lookup_var (qc->dict, qc->var_distance))
893 _("A variable called `%s' already exists."),
895 free (qc->var_distance);
896 qc->var_distance = NULL;
902 if (!lex_force_match (lexer, T_RPAREN))
908 lex_error (lexer, _("Expecting %s or %s."),
909 "CLUSTER", "DISTANCE");
914 else if (lex_match_id (lexer, "CRITERIA"))
916 lex_match (lexer, T_EQUALS);
917 while (lex_token (lexer) != T_ENDCMD
918 && lex_token (lexer) != T_SLASH)
920 if (lex_match_id (lexer, "CLUSTERS"))
922 if (lex_force_match (lexer, T_LPAREN) &&
923 lex_force_int_range (lexer, "CLUSTERS", 1, INT_MAX))
925 qc->ngroups = lex_integer (lexer);
927 if (!lex_force_match (lexer, T_RPAREN))
931 else if (lex_match_id (lexer, "CONVERGE"))
933 if (lex_force_match (lexer, T_LPAREN) &&
934 lex_force_num_range_open (lexer, "CONVERGE", 0, DBL_MAX))
936 qc->epsilon = lex_number (lexer);
938 if (!lex_force_match (lexer, T_RPAREN))
942 else if (lex_match_id (lexer, "MXITER"))
944 if (lex_force_match (lexer, T_LPAREN) &&
945 lex_force_int_range (lexer, "MXITER", 1, INT_MAX))
947 qc->maxiter = lex_integer (lexer);
949 if (!lex_force_match (lexer, T_RPAREN))
953 else if (lex_match_id (lexer, "NOINITIAL"))
955 qc->no_initial = true;
957 else if (lex_match_id (lexer, "NOUPDATE"))
959 qc->no_update = true;
963 lex_error (lexer, NULL);
970 lex_error (lexer, NULL);
978 cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
981 struct Kmeans *kmeans;
983 memset (&qc, 0, sizeof qc);
985 qc.dict = dataset_dict (ds);
988 qc.epsilon = DBL_EPSILON;
989 qc.missing_type = MISS_LISTWISE;
993 if (!quick_cluster_parse (lexer, &qc))
996 qc.wv = dict_get_weight (qc.dict);
999 struct casereader *group;
1000 struct casegrouper *grouper = casegrouper_create_splits (proc_open (ds), qc.dict);
1002 while (casegrouper_get_next_group (grouper, &group))
1004 if (qc.missing_type == MISS_LISTWISE)
1006 group = casereader_create_filter_missing (group, qc.vars, qc.n_vars,
1011 kmeans = kmeans_create (&qc);
1012 kmeans_cluster (kmeans, group, &qc);
1013 quick_cluster_show_results (kmeans, group, &qc);
1014 kmeans_destroy (kmeans);
1015 casereader_destroy (group);
1017 ok = casegrouper_destroy (grouper);
1019 ok = proc_commit (ds) && ok;
1022 /* If requested, set a transformation to append the cluster and
1023 distance values to the current dataset. */
1024 if (qc.save_trans_data)
1026 struct save_trans_data *std = qc.save_trans_data;
1027 std->appending_reader = casewriter_make_reader (std->writer);
1030 if (qc.save_values & SAVE_MEMBERSHIP)
1032 /* Invent a variable name if necessary. */
1035 ds_init_empty (&name);
1036 while (qc.var_membership == NULL)
1039 ds_put_format (&name, "QCL_%d", idx++);
1041 if (!dict_lookup_var (qc.dict, ds_cstr (&name)))
1043 qc.var_membership = strdup (ds_cstr (&name));
1049 std->membership = dict_create_var_assert (qc.dict, qc.var_membership, 0);
1052 if (qc.save_values & SAVE_DISTANCE)
1054 /* Invent a variable name if necessary. */
1057 ds_init_empty (&name);
1058 while (qc.var_distance == NULL)
1061 ds_put_format (&name, "QCL_%d", idx++);
1063 if (!dict_lookup_var (qc.dict, ds_cstr (&name)))
1065 qc.var_distance = strdup (ds_cstr (&name));
1071 std->distance = dict_create_var_assert (qc.dict, qc.var_distance, 0);
1074 static const struct trns_class trns_class = {
1075 .name = "QUICK CLUSTER",
1076 .execute = save_trans_func,
1077 .destroy = save_trans_destroy,
1079 add_transformation (qc.dataset, &trns_class, std);
1082 free (qc.var_distance);
1083 free (qc.var_membership);
1088 free (qc.var_distance);
1089 free (qc.var_membership);