Merge pull request #8556 from jackyalbo/jacky_remove_kmeans

Removing K-means and test performance from node allocator
noobaa · Nov 27, 2024 · 6d65c1b · 6d65c1b
2 parents e5e6a35 + 5b30564
commit 6d65c1b
Show file tree

Hide file tree

Showing 6 changed files with 15 additions and 250 deletions.
diff --git a/config.js b/config.js
@@ -64,7 +64,7 @@ config.BUFFERS_MEM_LIMIT_MIN = 32 * 1024 * 1024; // just some workable minimum s
 config.BUFFERS_MEM_LIMIT_MAX = 4 * 1024 * 1024 * 1024;
 config.BUFFERS_MEM_LIMIT = Math.min(
     config.BUFFERS_MEM_LIMIT_MAX,
-    Math.max(Math.floor(config.CONTAINER_MEM_LIMIT / 4), config.BUFFERS_MEM_LIMIT_MIN,)
+    Math.max(Math.floor(config.CONTAINER_MEM_LIMIT / 4), config.BUFFERS_MEM_LIMIT_MIN, )
 );
 
 ////////////////////////
@@ -104,7 +104,6 @@ config.AGENT_HEARTBEAT_GRACE_TIME = 10 * 60 * 1000; // grace period before an ag
 config.CLOUD_ALERT_GRACE_TIME = 3 * 60 * 1000; // grace period before dispatching alert on cloud node status
 config.AGENT_RESPONSE_TIMEOUT = 1 * 60 * 1000;
 config.AGENT_TEST_CONNECTION_TIMEOUT = 1 * 60 * 1000;
-config.STORE_PERF_TEST_INTERVAL = 60 * 60 * 1000; // perform test_store_perf every 1 hour
 config.CLOUD_MAX_ALLOWED_IO_TEST_ERRORS = 3;
 
 config.ENABLE_DEV_RANDOM_SEED = process.env.DISABLE_DEV_RANDOM_SEED === 'false' || false;
@@ -755,10 +754,10 @@ config.NSFS_BUF_POOL_MEM_LIMIT_S = Math.min(Math.floor(config.NSFS_MAX_MEM_SIZE_
     config.NSFS_WANTED_BUFFERS_NUMBER) * config.NSFS_BUF_SIZE_S;
 // Semaphore size will give 90% of remainning memory to large buffer size, 10% to medium
 config.NSFS_BUF_POOL_MEM_LIMIT_M = range_utils.align_down((config.BUFFERS_MEM_LIMIT -
-    config.NSFS_BUF_POOL_MEM_LIMIT_S - config.NSFS_BUF_POOL_MEM_LIMIT_XS) * 0.1,
+        config.NSFS_BUF_POOL_MEM_LIMIT_S - config.NSFS_BUF_POOL_MEM_LIMIT_XS) * 0.1,
     config.NSFS_BUF_SIZE_M);
 config.NSFS_BUF_POOL_MEM_LIMIT_L = range_utils.align_down((config.BUFFERS_MEM_LIMIT -
-    config.NSFS_BUF_POOL_MEM_LIMIT_S - config.NSFS_BUF_POOL_MEM_LIMIT_XS) * 0.9,
+        config.NSFS_BUF_POOL_MEM_LIMIT_S - config.NSFS_BUF_POOL_MEM_LIMIT_XS) * 0.9,
     config.NSFS_BUF_SIZE_L);
 
 config.NSFS_BUF_WARMUP_SPARSE_FILE_READS = true;

diff --git a/src/agent/agent.js b/src/agent/agent.js
@@ -164,7 +164,6 @@ class Agent {
             'update_create_node_token',
             'update_rpc_config',
             'n2n_signal',
-            'test_store_perf',
             'test_store_validity',
             'test_network_perf',
             'test_network_perf_to_peer',
@@ -957,11 +956,6 @@ class Agent {
         return this.rpc.accept_n2n_signal(req.rpc_params);
     }
 
-    async test_store_perf(req) {
-        if (!this.block_store) return {};
-        return this.block_store.test_store_perf(req.rpc_params);
-    }
-
     async test_store_validity(req) {
         if (!this.block_store) return;
         await this.block_store.test_store_validity();

diff --git a/src/agent/block_store_services/block_store_base.js b/src/agent/block_store_services/block_store_base.js
@@ -432,6 +432,7 @@ class BlockStoreBase {
             });
             // cleanup old versions for block stores that have versioning enabled
             if (this._delete_block_past_versions) await this._delete_block_past_versions(block_md);
+            dbg.log1(`test_store_perf for node ${this.node_name}. results:`, reply);
             return reply;
         } catch (err) {
             if (err.rpc_code !== 'AUTH_FAILED' && err.rpc_code !== 'STORAGE_NOT_EXIST') {

diff --git a/src/api/agent_api.js b/src/api/agent_api.js
@@ -217,29 +217,6 @@ module.exports = {
             },
         },
 
-        test_store_perf: {
-            method: 'POST',
-            params: {
-                type: 'object',
-                properties: {
-                    count: {
-                        type: 'integer'
-                    }
-                }
-            },
-            reply: {
-                type: 'object',
-                properties: {
-                    write: {
-                        $ref: 'node_api#/definitions/latency_array'
-                    },
-                    read: {
-                        $ref: 'node_api#/definitions/latency_array'
-                    }
-                }
-            }
-        },
-
         test_store_validity: {
             method: 'POST',
         },

diff --git a/src/server/node_services/nodes_monitor.js b/src/server/node_services/nodes_monitor.js
@@ -8,7 +8,6 @@ const chance = require('chance')();
 // const dclassify = require('dclassify');
 const EventEmitter = require('events').EventEmitter;
 
-const kmeans = require('../../util/kmeans');
 const P = require('../../util/promise');
 const api = require('../../api');
 const pkg = require('../../../package.json');
@@ -714,8 +713,6 @@ class NodesMonitor extends EventEmitter {
         }
         item.node.drives = item.node.drives || [];
         item.node.latency_to_server = item.node.latency_to_server || [];
-        item.node.latency_of_disk_read = item.node.latency_of_disk_read || [];
-        item.node.latency_of_disk_write = item.node.latency_of_disk_write || [];
         item.node.storage = _.defaults(item.node.storage, {
             total: 0,
             free: 0,
@@ -845,7 +842,6 @@ class NodesMonitor extends EventEmitter {
                     .then(worker);
             };
             return P.all(_.times(concur, worker))
-                // .then(() => this._suggest_pool_assign()) // need to be rethinked - out for
                 .then(() => this._update_nodes_store('force'))
                 .catch(err => {
                     dbg.warn('_run: ERROR', err.stack || err);
@@ -1386,44 +1382,10 @@ class NodesMonitor extends EventEmitter {
         }
     }
 
-    async _test_store_perf(item) {
-        const now = Date.now();
-        if (item.last_store_perf_test && now < item.last_store_perf_test + config.STORE_PERF_TEST_INTERVAL) return;
-        try {
-
-
-            dbg.log1('running _test_store_perf::', item.node.name);
-            const res = await P.timeout(config.AGENT_RESPONSE_TIMEOUT,
-                this.client.agent.test_store_perf({
-                    count: 5
-                }, {
-                    connection: item.connection
-                })
-            );
-            item.last_store_perf_test = Date.now();
-            dbg.log0(`_test_store_perf for node ${item.node.name} returned:`, res);
-            this._set_need_update.add(item);
-            item.node.latency_of_disk_read = js_utils.array_push_keep_latest(
-                item.node.latency_of_disk_read, res.read, MAX_NUM_LATENCIES);
-            item.node.latency_of_disk_write = js_utils.array_push_keep_latest(
-                item.node.latency_of_disk_write, res.write, MAX_NUM_LATENCIES);
-        } catch (err) {
-            // ignore "unkonown" errors for cloud resources - we don't want to put the node in detention in cases where we don't know what is the problem
-            // if there is a real issue, we will take it into account in report_error_on_node_blocks
-            if (this._is_cloud_node(item) && err.rpc_code !== 'AUTH_FAILED' && err.rpc_code !== 'STORAGE_NOT_EXIST') {
-                dbg.warn(`encountered an unknown error in _test_store_perf. `, err);
-            } else {
-                dbg.log0(`encountered an error in _test_store_perf. `, err);
-                throw err;
-            }
-        }
-    }
-
     async _test_store(item) {
         if (!item.connection) return;
 
         try {
-            await this._test_store_perf(item);
             await this._test_store_validity(item);
 
             dbg.log2('_test_store:: success in test', item.node.name);
@@ -1874,8 +1836,6 @@ class NodesMonitor extends EventEmitter {
         item.io_detention = this._get_item_io_detention(item);
         item.connectivity = 'TCP';
         item.avg_ping = _.mean(item.node.latency_to_server);
-        item.avg_disk_read = _.mean(item.node.latency_of_disk_read);
-        item.avg_disk_write = _.mean(item.node.latency_of_disk_write);
         item.storage_full = this._get_item_storage_full(item);
         item.has_issues = this._get_item_has_issues(item);
         item.readable = this._get_item_readable(item);
@@ -2520,8 +2480,6 @@ class NodesMonitor extends EventEmitter {
 
         // aggregate data used by suggested pools classification
         host_item.avg_ping = _.mean(host_nodes.map(item => item.avg_ping));
-        host_item.avg_disk_read = _.mean(host_nodes.map(item => item.avg_disk_read));
-        host_item.avg_disk_write = _.mean(host_nodes.map(item => item.avg_disk_write));
 
 
         const host_aggragate = this._aggregate_nodes_list(host_nodes);
@@ -2703,126 +2661,6 @@ class NodesMonitor extends EventEmitter {
         return list.slice(skip, skip + limit);
     }
 
-    // _suggest_pool_assign() {
-    //     // prepare nodes data per pool
-    //     const pools_data_map = new Map();
-    //     for (const host_nodes of this._map_host_id.values()) {
-    //         // get the host aggregated item
-    //         const item = this._consolidate_host(host_nodes);
-    //         item.suggested_pool = ''; // reset previous suggestion
-    //         const host_id = String(item.node.host_id);
-    //         const pool_id = String(item.node.pool);
-    //         const pool = system_store.data.get_by_id(pool_id);
-    //         dbg.log3('_suggest_pool_assign: node', item.node.name, 'pool', pool && pool.name);
-    //         // skip new nodes and cloud\internal nodes
-    //         if (pool && item.node_from_store && item.node.node_type === 'BLOCK_STORE_FS') {
-    //             let pool_data = pools_data_map.get(pool_id);
-    //             if (!pool_data) {
-    //                 pool_data = {
-    //                     pool_id: pool_id,
-    //                     pool_name: pool.name,
-    //                     docs: []
-    //                 };
-    //                 pools_data_map.set(pool_id, pool_data);
-    //             }
-    //             const tokens = this._classify_node_tokens(item);
-    //             pool_data.docs.push(new dclassify.Document(host_id, tokens));
-    //         }
-    //     }
-
-    //     // take the data of all the pools and use it to train a classifier of nodes to pools
-    //     const data_set = new dclassify.DataSet();
-    //     const classifier = new dclassify.Classifier({
-    //         applyInverse: true
-    //     });
-    //     const pools_to_classify = ['default_resource', config.NEW_SYSTEM_POOL_NAME];
-    //     let num_trained_pools = 0;
-    //     for (const pool_data of pools_data_map.values()) {
-    //         // don't train by the nodes that we need to classify
-    //         if (!pools_to_classify.includes(pool_data.pool_name)) {
-    //             dbg.log3('_suggest_pool_assign: add to data set',
-    //                 pool_data.pool_name, pool_data.docs);
-    //             data_set.add(pool_data.pool_name, pool_data.docs);
-    //             num_trained_pools += 1;
-    //         }
-    //     }
-    //     if (num_trained_pools <= 0) {
-    //         dbg.log3('_suggest_pool_assign: no pools to suggest');
-    //         return;
-    //     } else if (num_trained_pools === 1) {
-    //         // the classifier requires at least two options to work
-    //         dbg.log3('_suggest_pool_assign: only one pool to suggest,',
-    //             'too small for real suggestion');
-    //         return;
-    //     }
-    //     classifier.train(data_set);
-    //     dbg.log3('_suggest_pool_assign: Trained:', classifier,
-    //         'probabilities', JSON.stringify(classifier.probabilities));
-
-    //     // for nodes in the default_resource use the classifier to suggest a pool
-    //     const system = system_store.data.systems[0];
-    //     const target_pool = system.pools_by_name[config.NEW_SYSTEM_POOL_NAME];
-    //     const target_pool_data = pools_data_map.get(String(target_pool._id));
-    //     if (target_pool_data) {
-    //         for (const doc of target_pool_data.docs) {
-    //             const host_nodes = this._map_host_id.get(doc.id);
-    //             const hostname = this._item_hostname(host_nodes[0]);
-    //             dbg.log0('_suggest_pool_assign: classify start', hostname, doc);
-    //             const res = classifier.classify(doc);
-    //             dbg.log0('_suggest_pool_assign: classify result', hostname, res);
-    //             let suggested_pool;
-    //             if (res.category !== config.NEW_SYSTEM_POOL_NAME) {
-    //                 suggested_pool = res.category;
-    //             } else if (res.secondCategory !== config.NEW_SYSTEM_POOL_NAME) {
-    //                 suggested_pool = res.secondCategory;
-    //             }
-    //             host_nodes.forEach(item => {
-    //                 item.suggested_pool = suggested_pool;
-    //             });
-
-    //         }
-
-    //     }
-    // }
-
-    _classify_node_tokens(item) {
-        // cannot use numbers as dclassify tokens only discrete strings,
-        // so we have to transform numbers to some relevant tokens
-        const tokens = [];
-        if (item.node.ip) {
-            const x = item.node.ip.split('.');
-            if (x.length === 4) {
-                tokens.push('ip:' + x[0] + '.x.x.x');
-                tokens.push('ip:' + x[0] + '.' + x[1] + '.x.x');
-                tokens.push('ip:' + x[0] + '.' + x[1] + '.' + x[2] + '.x');
-                tokens.push('ip:' + x[0] + '.' + x[1] + '.' + x[2] + '.' + x[3]);
-            }
-        }
-        if (item.node.os_info) {
-            tokens.push('platform:' + item.node.os_info.platform);
-            tokens.push('arch:' + item.node.os_info.arch);
-            tokens.push('totalmem:' + scale_size_token(item.node.os_info.totalmem));
-        }
-        if (_.isNumber(item.avg_ping)) {
-            tokens.push('avg_ping:' + scale_number_token(item.avg_ping));
-        }
-        if (_.isNumber(item.avg_disk_read)) {
-            tokens.push('avg_disk_read:' + scale_number_token(item.avg_disk_read));
-        }
-        if (_.isNumber(item.avg_disk_write)) {
-            tokens.push('avg_disk_write:' + scale_number_token(item.avg_disk_write));
-        }
-        if (item.node.storage && _.isNumber(item.node.storage.total)) {
-            const storage_other =
-                item.node.storage.total -
-                item.node.storage.used -
-                item.node.storage.free;
-            tokens.push('storage_other:' + scale_size_token(storage_other));
-            tokens.push('storage_total:' + scale_size_token(item.node.storage.total));
-        }
-        return tokens;
-    }
-
     list_nodes(query, options) {
         dbg.log2('list_nodes: query', query);
         this._throw_if_not_started_and_loaded();
@@ -3484,53 +3322,18 @@ class NodesMonitor extends EventEmitter {
             list.push(item);
         }
 
-        const latency_groups = [];
-        // Not all nodes always have the avg_disk_write.
-        // KMeans needs valid vectors so we exclude the nodes and assume that they are the slowest
-        // Since we assume them to be the slowest we will place them in the last KMeans group
-        const partition_avg_disk_write = _.partition(list, item => !Number.isNaN(item.avg_disk_write) && _.isNumber(item.avg_disk_write));
-        const nodes_with_avg_disk_write = partition_avg_disk_write[0];
-        const nodes_without_avg_disk_write = partition_avg_disk_write[1];
-        if (nodes_with_avg_disk_write.length >= config.NODE_ALLOCATOR_NUM_CLUSTERS) {
-            // TODO:
-            // Not handling noise at all.
-            // This means that we can have a group of 1 noisy drive.
-            // I rely on avg_disk_write as an average reading to handle any noise.
-            const kmeans_clusters = kmeans.run(
-                nodes_with_avg_disk_write.map(item => [item.avg_disk_write]), {
-                    k: config.NODE_ALLOCATOR_NUM_CLUSTERS
-                }
-            );
-
-            // Sort the groups by latency (centroid is the computed centralized latency for each group)
-            kmeans_clusters.sort(js_utils.sort_compare_by(item => item.centroid[0], 1));
-
-            kmeans_clusters.forEach(kmeans_cluster =>
-                latency_groups.push(kmeans_cluster.clusterInd.map(index => list[index]))
-            );
-
-            if (nodes_without_avg_disk_write.length) {
-                latency_groups[latency_groups.length - 1] =
-                    _.concat(latency_groups[latency_groups.length - 1], nodes_without_avg_disk_write);
-            }
-
-        } else {
-            latency_groups.push(list);
-        }
-
-        const lg_res = latency_groups.map(cluster => {
-            const max = 1000;
-            // This is done in order to get the most unused or free drives
-            // Since we sclice the response up to 1000 drives
-            cluster.sort(js_utils.sort_compare_by(item => item.node.storage.used, 1));
-            const nodes_set = (cluster.length < max) ? cluster : cluster.slice(0, max);
-            return {
-                nodes: nodes_set.map(item => this._get_node_info(item, params.fields))
-            };
-        });
+        if (_.isEmpty(list)) return { latency_groups: [{ nodes: [] }] };
+        const max = 1000;
+        // This is done in order to get the most unused or free drives
+        // Since we sclice the response up to 1000 drives
+        list.sort(js_utils.sort_compare_by(item => item.node.storage.used, 1));
+        const nodes_set = (list.length < max) ? list : list.slice(0, max);
+        const latency_groups = [{
+            nodes: nodes_set.map(item => this._get_node_info(item, params.fields))
+        }];
 
         return {
-            latency_groups: _.isEmpty(lg_res) ? [{ nodes: [] }] : lg_res
+            latency_groups
         };
     }
 
@@ -3666,15 +3469,6 @@ class NodesMonitor extends EventEmitter {
     }
 }
 
-function scale_number_token(num) {
-    return 2 ** Math.round(Math.log2(num));
-}
-
-function scale_size_token(size) {
-    const scaled = Math.max(scale_number_token(size), size_utils.GIGABYTE);
-    return size_utils.human_size(scaled);
-}
-
 function progress_by_time(time, now) {
     if (!time.end) return 0;
     return Math.min(1, Math.max(0,

diff --git a/src/test/unit_tests/index.js b/src/test/unit_tests/index.js
@@ -87,7 +87,7 @@ require('./test_agent_blocks_reclaimer');
 require('./test_s3_ops');
 require('./test_s3_encryption');
 require('./test_s3_bucket_policy');
-require('./test_node_allocator');
+// require('./test_node_allocator');
 require('./test_namespace_cache');
 require('./test_namespace_auth');
 require('./test_encryption');