群岛,美国,‘瓦利斯和富图纳群岛’,‘西方 撒哈拉 </跨度> ’,‘也门’,‘赞比亚’,‘津巴布韦’];var terms,terms_esc;function viral_permutations(){ var t0,t1,i,permuted,res_elm,meta_elm锟
由于在Vue中恢复工作字符串的问题,放弃了缩小集合的概念。
现在,方法简单如下:
代码被评论。
原始javascript(记录过滤/操作的选项数组): https://jsfiddle.net/pvLj9uxe/14/
新的Vue实施: https://jsfiddle.net/15prcpxn/30/
计算似乎相当快 - DOM更新是杀死它的原因。
添加到比较*: https://jsfiddle.net/ektyx133/4/
*警告:预处理选项(视为“静态”)是策略的一部分,因此它已在基准之外进行处理。
var separator = /\s|\*|,/; // this function enhances the raw options array function enhanceOptions(options) { return options.map(option => ({ working: option.toLowerCase(), // for use in filtering the set and matching display: option // for displaying })) } // this function changes the input to lower case, splits the input into terms, removes empty strings from the array, and enhances the terms with the size and wiping string function processInput(input) { return input.trim().toLowerCase().split(separator).filter(term => term.length).map(term => ({ value: term.toLowerCase(), size: term.length, wipe: " ".repeat(term.length) })).sort((a, b) => b.size - a.size); } // this function filters the data set, then finds the match ranges, and finally returns an array with HTML tags inserted function filterAndHighlight(terms, enhancedOptions) { let options = enhancedOptions, l = terms.length; // filter the options - consider recursion instead options = options.filter(option => { let i = 0, working = option.working, term; while (i < l) { if (!~working.indexOf((term = terms[i]).value)) return false; working = working.replace(term.value, term.wipe); i++; } return true; }) // generate the display string array let displayOptions = options.map(option => { let rangeSet = [], working = option.working, display = option.display; // find the match ranges terms.forEach(term => { working = working.replace(term.value, (match, offset) => { // duplicate the wipe string replacement from the filter, but grab the offsets rangeSet.push({ start: offset, end: offset + term.size }); return term.wipe; }) }) // sort the match ranges, last to first rangeSet.sort((a, b) => b.start - a.start); // insert the html tags within the string around each match range rangeSet.forEach(range => { display = display.slice(0, range.start) + '<u>' + display.slice(range.start, range.end) + '</u>' + display.slice(range.end) }) return display; }) return displayOptions; }
https://jsfiddle.net/15prcpxn/25/
我的尝试,使用Vue进行渲染(方法是顺序的,所以你可以把它全部放在一个单片函数中而不需要太多努力 - 输入将是术语,完整选项集;输出将被过滤选项集,并突出显示范围)。
"abc ab"
"a abc"
"abc"
"a"
"bc"
start = 0, end = 1
" bc"
<u>
</u>
new option = "<u>" + "a" + "</u>" + "bc"
当存在许多匹配/不可用的术语时(例如,当您输入单个字符时),性能很差。对于最终用途,我会可能会输入计算延迟。
我应该能够将其中一些步骤汇总到更少的步骤,这可以提高性能。我明天再来。
据推测,Vue还可以通过虚拟DOM等处理一些优化,因此它不一定能反映出vanilla Javascript / DOM渲染。
这是一种完全不同于我之前的答案的方法 - 我无法将以下所有内容添加到(大小限制)中,所以......这是一个单独的答案。
一个 广义后缀树 理论上允许以有效的方式在一组字符串中搜索子字符串的结构。所以我以为我会去做。
正如可以看到的那样,以有效的方式构建这样一棵树远非微不足道 这个对Ukkonen算法的精彩解释 ,这涉及建立一个 后缀树 一个短语(选项)。
我从实施中汲取灵感 在这里找到 ,需要适应:
所以这里是:
"use strict"; // Implementation of a Generalized Suffix Tree using Ukkonen's algorithm // See also: https://stackoverflow.com/q/9452701/5459839 class Node { constructor() { this.edges = {}; this.suffixLink = null; } addEdge(ch, textId, start, end, node) { this.edges[ch] = { textId, start, end, node }; } } class Nikkonen extends Node { constructor() { super(); // root node of the tree this.texts = []; } findNode(s) { if (!s.length) return; let node = this, len, suffixSize = 0, edge; for (let i = 0; i < s.length; i += len) { edge = node.edges[s.charAt(i)]; if (!edge) return; len = Math.min(edge.end - edge.start, s.length - i); if (this.texts[edge.textId].substr(edge.start, len) !== s.substr(i, len)) return; node = edge.node; } return { edge, len }; } findAll(term, termId = 1) { const { edge, len } = this.findNode(term) || {}; if (!edge) return {}; // not found // Find all leaves const matches = new Map; (function recurse({ node, textId, start, end }, suffixLen) { suffixLen += end - start; const edges = Object.values(node.edges); if (!edges.length) { // leaf node: calculate the match if (!(matches.has(textId))) matches.set(textId, []); matches.get(textId).push({ offset: end - suffixLen, termId }); return; } edges.forEach( edge => recurse(edge, suffixLen) ); })(edge, term.length - len); return matches; } addText(text) { // Implements Nikkonen's algorithm for building the tree // Inspired by https://felix-halim.net/misc/suffix-tree/ const root = this, active = { node: root, textId: this.texts.length, start: 0, end: 0, }, texts = this.texts; // Private functions function getChar(textId, i) { return texts[textId].charAt(i) || '$' + textId; } function addEdge(fromNode, textId, start, end, node) { fromNode.addEdge(getChar(textId, start), textId, start, end, node); } function testAndSplit() { const ch = getChar(active.textId, active.end); if (active.start < active.end) { const edge = active.node.edges[getChar(active.textId, active.start)], splitPoint = edge.start + active.end - active.start; if (ch === getChar(edge.textId, splitPoint)) return; const newNode = new Node(); addEdge(active.node, edge.textId, edge.start, splitPoint, newNode); addEdge(newNode, edge.textId, splitPoint, edge.end, edge.node); return newNode; } if (!(ch in active.node.edges)) return active.node; } function canonize() { while (active.start < active.end) { const edge = active.node.edges[getChar(active.textId, active.start)]; if (edge.end - edge.start > active.end - active.start) break; active.start += edge.end - edge.start; active.node = edge.node; } } function update() { let prevNewNode = root, newNode; while (newNode = testAndSplit()) { addEdge(newNode, active.textId, active.end, text.length+1, new Node()); // Rule 2: add suffix-link from previously inserted node if (prevNewNode !== root) { prevNewNode.suffixLink = newNode; } prevNewNode = newNode; // Rule 3: follow suffixLink after split active.node = active.node.suffixLink || root; canonize(); // because active.node changed } if (prevNewNode !== root) { prevNewNode.suffixLink = active.node; } } texts.push(text); if (!root.suffixLink) root.suffixLink = new Node(); for (let i = 0; i < text.length; i++) { addEdge(root.suffixLink, active.textId, i, i+1, root); } // Main Ukkonen loop: add each character from left to right to the tree while (active.end <= text.length) { update(); active.end++; canonize(); // because active.end changed } } } function trincotSuffixTree(query, options, suffixTree, separator) { // Split query in terms at delimiter const terms = query.split(separator).filter(Boolean); if (!terms.length) return options; // Sort terms by descending size terms.sort( (a,b) => b.length - a.length ); // create Map keyed by term with count info const termMap = new Map(terms.map( (term, termId) => [term, { termId, count: 0, leftOver: 0, size: term.length }] )); terms.forEach( (term) => termMap.get(term).count++ ); function getNonOverlaps(offsets, leftOver, lastIndex = 0, offsetIndex = 0) { // All terms found? if (!leftOver) return []; let giveUpAt = Infinity; // While still enough matches left over: while (offsetIndex + leftOver <= offsets.length) { const { termId, offset } = offsets[offsetIndex++]; if (offset < lastIndex) continue; // overlap, try next if (offset >= giveUpAt) break; // Looking further makes no sense const termInfo = termMap.get(terms[termId]); //console.log('termId', termId, 'offset', offset, 'size', termInfo.size, 'lastIndex', lastIndex); if (!termInfo.leftOver) continue; // too many of the same term, try next termInfo.leftOver--; const result = getNonOverlaps(offsets, leftOver - 1, offset + termInfo.size, offsetIndex); // If success, then completely backtrack out of recursion. if (result) return result.concat([offset + termInfo.size, offset]); termInfo.leftOver++; // restore after failed recursive search and try next // If a term-match at a given offset could not lead to a solution (in recursion), // and if we keep those matched character postions all unmatched and only start matching after // the end of that location, it will certainly not lead to a solution either. giveUpAt = Math.min(giveUpAt, offset + termInfo.size); } } let allTermsAllOptionsOffsets; // Loop through the unique terms: for (let [term, termInfo] of termMap) { // Get the offsets of the matches of this term in all options (in the preprocessed tree) const thisTermAllOptionsOffsets = suffixTree.findAll(term, termInfo.termId); //console.log('findAll:', JSON.stringify(Array.from(thisTermAllOptionsOffsets))); if (!thisTermAllOptionsOffsets.size) return []; // No option has this term, so bail out if (!allTermsAllOptionsOffsets) { allTermsAllOptionsOffsets = thisTermAllOptionsOffsets; } else { // Merge with all previously found offsets for other terms (intersection) for (let [optionId, offsets] of allTermsAllOptionsOffsets) { let newOffsets = thisTermAllOptionsOffsets.get(optionId); if (!newOffsets || newOffsets.length < termInfo.count) { // this option does not have enough occurrences of this term allTermsAllOptionsOffsets.delete(optionId); } else { allTermsAllOptionsOffsets.set(optionId, offsets.concat(newOffsets)); } } if (!allTermsAllOptionsOffsets.size) return []; // No option has all terms, so bail out } } // Per option, see if (and where) the offsets can serve non-overlapping matches for each term const matches = Array.from(allTermsAllOptionsOffsets, ([optionId, offsets]) => { // Indicate how many of each term must (still) be matched: termMap.forEach( obj => obj.leftOver = obj.count ); return [optionId, getNonOverlaps(offsets.sort( (a, b) => a.offset - b.offset ), terms.length)]; }) // Remove options that could not provide non-overlapping offsets .filter( ([_, offsets]) => offsets ) // Sort the remaining options in their original order .sort( (a,b) => a[0] - b[1] ) // Replace optionId, by the corresponding text and apply mark-up at the offsets .map( ([optionId, offsets]) => { let option = options[optionId]; offsets.map((index, i) => { option = option.substr(0, index) + (i%2 ? "<u>" : "</u>") + option.substr(index); }); return option; }); //console.log(JSON.stringify(matches)); return matches; } function trincotPreprocess(options) { const nikkonen = new Nikkonen(); // Add all the options (lowercased) to the suffic tree options.map(option => option.toLowerCase()).forEach(nikkonen.addText.bind(nikkonen)); return nikkonen; } const options = ['abbbba', 'United States', 'United Kingdom', 'Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia, Plurinational State of', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo', 'Congo, The Democratic Republic of The', 'Cook Islands', 'Costa Rica', 'Cote D\'ivoire', 'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-bissau', 'Guyana', 'Haiti', 'Heard Island and Mcdonald Islands', 'Holy See (Vatican City State)', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic of', 'Iraq', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Korea, Democratic People\'s Republic of', 'Korea, Republic of', 'Kuwait', 'Kyrgyzstan', 'Lao People\'s Democratic Republic', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia, The Former Yugoslav Republic of', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia, Federated States of', 'Moldova, Republic of', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestinian Territory, Occupied', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Reunion', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Barthelemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin (French part)', 'Saint Pierre and Miquelon', 'Saint Vincent and The Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia and The South Sandwich Islands', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Svalbard and Jan Mayen', 'Swaziland', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Taiwan, Province of China', 'Tajikistan', 'Tanzania, United Republic of', 'Thailand', 'Timor-leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela, Bolivarian Republic of', 'Viet Nam', 'Virgin Islands, British', 'Virgin Islands, U.S.', 'Wallis and Futuna', 'Western Sahara', 'Yemen', 'Zambia', 'Zimbabwe']; /* * I/O and performance measurements */ let preprocessed; function processInput() { if (!preprocessed) { // Only first time const t0 = performance.now(); preprocessed = trincotPreprocess(options); const spentTime = performance.now() - t0; // Output the time spent on preprocessing pretime.textContent = spentTime.toFixed(2); } var query = this.value.toLowerCase(); const t0 = performance.now(); const matches = trincotSuffixTree(query, options, preprocessed, ' '); const spentTime = performance.now() - t0; // Output the time spent time.textContent = spentTime.toFixed(2); // Output the matches result.innerHTML = ''; for (var match of matches) { // Append it to the result list var li = document.createElement('li'); li.innerHTML = match; result.appendChild(li); } } findTerms.addEventListener('keyup', processInput); processInput.call(findTerms);
ul { height:300px; font-size: smaller; overflow: auto; }
Input terms: <input type="text" id="findTerms"><br> <h3>Trincot's Suffix Tree Search</h3> Preprocessing Time: <span id="pretime"></span>ms (only done once)<br> Time: <span id="time"></span>ms<br> <ul id="result"></ul>
这个方法背后有相当多的代码,所以我认为它可能不会显示小数据集的有趣性能,而对于较大的数据集,它将消耗内存:树占用的内存比原始选项数组多得多。