Hi,
Here's a little algorithm I came up with tonight, well nothing fancy but I quite like it. You give it sequences (arrays), it gets used to it, and when you give it incomplete sequences, it returns the most probable complete one. As I said, nothing fancy. It will end up in BirdyVM. Here is it.
sys.getUsual = (function () {
var corpus = [];
var getExtensions = function (base, corpus) {
var result = [];
for (let c of corpus) {
var discard = false;
for (let b of base)
if (!c.includes(b)) {
discard = true;
break;
}
if (!discard) result.push(c);
}
return result;
}
// from https://stackoverflow.com/questions/11919065/sort-an-array-by-the-levenshtein-distance-with-best-performance-in-javascript
var distance = function (s, t) {
var d = [];
var n = s.length;
var m = t.length;
if (n == 0) return m;
if (m == 0) return n;
for (var i = n; i >= 0; i--) d[i] = [];
for (var i = n; i >= 0; i--) d[i][0] = i;
for (var j = m; j >= 0; j--) d[0][j] = j;
for (var i = 1; i <= n; i++) {
var s_i = s[i - 1];
for (var j = 1; j <= m; j++) {
if (i == j && d[i][j] > 4) return n;
var t_j = t[j - 1];
var cost = (s_i == t_j) ? 0 : 1;
var mi = d[i - 1][j] + 1;
var b = d[i][j - 1] + 1;
var c = d[i - 1][j - 1] + cost;
if (b < mi) mi = b;
if (c < mi) mi = c;
d[i][j] = mi;
if (i > 1 && j > 1 && s_i == t[j - 2] && s[i - 2] == t_j) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
}
}
}
return d[n][m];
}
var getScores = function (base, line) {
var score = [];
for (let l = 0; l < line.length; l++) score[l] = 0;
for (let l = 0; l < line.length; l++) {
for (let word of line[l]) {
for (let l2 = 0; l2 < line.length; l2++) {
if (line[l2].includes(word)) score[l2] += 1;
}
}
}
var result = [];
for (let r = 0; r < score.length; r++) {
var ls = score[r];
var ld = distance(base, line[r]);
var ll = line[r].length;
result.push({
line: line[r],
score: [
ls / ld / ll,
ls / ld / 1,
ls / 1 / 1,
1 / ld / ll,
1 / ld / 1,
1 / 1 / ll
]
});
}
return result;
}
var getBest = function (base, result) {
for (let method = 0; method < 6; method++) {
var best = [];
for (let r of result) {
var highest = 0;
if (r.score[method] > highest) {
best = [r.line];
highest = r.score[method];
}
else if (r.score[method] === highest) best.push(r.line);
}
if (best.length === 1) return best[0];
}
return base;
}
return function (sequence, maxCorpusLength) {
var extensions = getExtensions(sequence, corpus);
var scores = getScores(sequence, extensions);
var best = getBest(sequence, scores);
while (corpus.length >= maxCorpusLength)
corpus.splice(Math.floor(Math.random()*corpus.length), 1);
corpus.push(sequence);
return best;
}
})();
I'm a bit tired tonight, but I'll try to roughly describe it tomorrow.