// Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// This script takes kanji with ruby text over it and removes repeated parts
// It's called automatically by showKanji.js if any furigana was added

// The basic algorithm searches for *continuous* hiragana/katakana/latin/punctuation
// strings that are in both the base and reading, and splits on these. This does
// not take into account any lexical information (so it doesn't know anything about
// particles or individual kanji readings). It can also fail for more complicated
// cases, but the script should be able to abort for these (maybe in the future we can
// continue and just ignore that specific base and substring).

// License: CC0

// Entry point. Reads the kanji text and its furigana from #kanjiInfo, tries to
// split them into aligned base/reading pieces with bindKana(), and renders the
// split ruby with displayBoundKana() if at least one split was found.
//
// Returns nothing. Does nothing when the kanji or its ruby text is hidden,
// when the ruby element (or its leading text node) is missing, or when either
// text is empty. Errors thrown by bindKana() propagate to the caller.
function getKanjiInfo() {
    // Don't run if the kanji or the ruby is hidden
    if ($("#kanjiInfo").css("display") === "none" || $("#kanjiInfo rt").css("display") === "none") {
        return;
    }

    // The ruby element may not exist, or may have no child nodes at all;
    // indexing into it blindly would throw a TypeError.
    const rubyElement = $("#kanjiInfo ruby")[0];
    const baseNode = rubyElement && rubyElement.childNodes[0];
    if (!baseNode) {
        return;
    }

    // nodeValue is null when the first child is an element rather than text
    const kanji = baseNode.nodeValue;
    const kana = $("#kanjiInfo rt").text();

    if (!kanji || !kana) {
        return;
    }

    // bindKana() splits these two arrays in place
    const bases = [kanji];
    const readings = [kana];

    bindKana(bases, readings);

    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}

// Repeatedly calls tryBind() to split `bases` and `readings` (parallel arrays,
// modified in place) until a pass finds nothing more to split or the
// iteration cap is reached, then validates the result.
//
// Throws an Error when:
//  - the two arrays end up with different lengths,
//  - a non-empty base/reading pair differs in length by a factor of 6 or more
//    in either direction (a sign that the split went wrong),
//  - the iteration cap was reached without a single split.
// Logs a warning when the iteration cap was reached.
function bindKana(bases, readings) {
    const maxIterations = 25;
    let iterations = 0;
    let foundBindings = true;
    while (foundBindings && iterations < maxIterations) {
        iterations++;
        foundBindings = tryBind(bases, readings);
    }

    // Sanity check
    if (bases.length !== readings.length) {
        throw new Error("bindKana.js: Bases and readings arrays don't have same lengths.");
    }

    // Check kanji:kana ratio
    for (let i = 0; i < bases.length; i++) {
        const kanjiLength = bases[i].length;
        const kanaLength = readings[i].length;
        // Empty halves are expected (split-out kana have an empty reading)
        if (kanjiLength === 0 || kanaLength === 0) { continue; }

        const ratio = kanaLength / kanjiLength;
        if (ratio >= 6 || ratio <= 1/6) {
            throw new Error("bindKana.js: kanji:kana ratio greater than 6 for `"
                            + bases[i] + "` and `" + readings[i] + "`.");
        }
    }

    // The cap was hit only if the last allowed pass was still splitting.
    // (When the loop stops on its own, foundBindings is false.)
    if (foundBindings && iterations === maxIterations) {
        console.warn("bindKana.js: Encountered maximum iterations.");

        if (bases.length === 1) {
            throw new Error("bindKana.js: Encountered maximum iterations while furigana wasn't split once.");
        }
    }
}

// Runs one splitting pass over the bases that existed when the pass started.
// For each of those indexes with a non-empty reading, the kana patterns are
// tried in order (katakana, alphanumeric, hiragana, misc) until the arrays
// have grown. Note that the index loop keeps going after a split, so later
// indexes are then only offered to the first pattern during this pass.
//
// Returns true if anything was split (after validating the result), false
// otherwise. Throws an Error if a split left a kanji base without a reading
// or an empty base with a reading.
function tryBind(bases, readings) {
    const patterns = [
        kanaRegexes.katakanaRe,
        kanaRegexes.alphanumRe,
        kanaRegexes.hiraganaRe,
        kanaRegexes.miscRe,
    ];
    const initialCount = bases.length;
    const hasSplit = () => bases.length !== initialCount;

    for (let idx = 0; idx < initialCount; idx += 1) {
        if (readings[idx] !== "") {
            // some() stops at the first pattern after which the arrays differ
            patterns.some((pattern) => {
                searchBase(bases, readings, idx, pattern);
                return hasSplit();
            });
        }
    }

    if (!hasSplit()) {
        return false;
    }

    // Make sure splitting didn't mess up the bindings
    bases.forEach((base, pos) => {
        const reading = readings[pos];
        if (kanaRegexes.kanjiRe.test(base) && reading === "") {
            throw new Error("bindKana.js: Kanji base with no reading: `"
                            + base + "` at index " + pos);
        }
        if (base === "" && reading) {
            throw new Error("bindKana.js: Blank base with reading: `"
                            + reading + "` at index " + pos);
        }
    });

    return true;
}

// Looks for runs matching `re` in bases[index] and tries to split the
// base/reading pair at `index` on each run, in order, stopping at the first
// run that actually produces a split.
//
// bases/readings are parallel arrays and are modified in place. Returns
// nothing. Throws an Error if a split did not add exactly two new elements.
function searchBase(bases, readings, index, re) {
    const countBefore = bases.length;
    const runs = bases[index].match(re);
    if (!runs) {
        return;
    }

    // Misc stuff like whitespace should be split searching forward;
    // everything else is searched from the end.
    const split = re === kanaRegexes.miscRe ? splitFuriganaForward : splitFuriganaReverse;

    for (const run of runs) {
        // Handle case where the furigana is just a hiragana version of the katakana.
        // Only works if the whole base is katakana and equals the converted reading.
        const baseIsPureKatakana = re === kanaRegexes.katakanaRe && /^[ァ-ヴ]+$/.test(bases[index]);
        if (baseIsPureKatakana && bases[index] === readings[index].hiraganaToKatakana()) {
            readings[index] = readings[index].hiraganaToKatakana();
        }

        split(bases, readings, index, run);

        // Nothing was split on this run; try the next one
        if (bases.length === countBefore) {
            continue;
        }

        // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
        if (bases.length !== countBefore + 2) {
            throw new Error("bindKana.js: Splitting added more than two new parts.");
        }
        return;
    }
}

// Returns a copy of the string in which every character in U+3041..U+3096
// (hiragana) is replaced by the character 0x60 code points above it
// (its katakana counterpart). All other characters are left untouched.
String.prototype.hiraganaToKatakana = function() {
    var katakanaOffset = 0x0060;
    var shiftToKatakana = (ch) => String.fromCharCode(ch.charCodeAt(0) + katakanaOffset);
    return this.replace(/[\u3041-\u3096]/g, shiftToKatakana);
};

// We search from the end of the text because particles are suffixes.
//
// If `substring` occurs in both bases[index] and readings[index], replaces
// that pair in place by three pairs, split at the LAST occurrence in each:
//   bases:    [left, substring, right]
//   readings: [left, "",        right]
// Otherwise leaves both arrays untouched. Returns nothing.
//
// Plain string searching is used (rather than a regex with `.`) so that text
// containing line breaks is kept intact on both sides of the split.
function splitFuriganaReverse(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];

    // First make sure substring is in both the base and its reading
    const baseStart = base.lastIndexOf(substring);
    const readingStart = reading.lastIndexOf(substring);
    if (baseStart === -1 || readingStart === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(
        index, 1,
        base.slice(0, baseStart),
        substring,
        base.slice(baseStart + substring.length)
    );
    // The matched kana carries no furigana of its own, hence the empty reading
    readings.splice(
        index, 1,
        reading.slice(0, readingStart),
        "",
        reading.slice(readingStart + substring.length)
    );
}

// Returns `str` with its characters in reverse order.
// Reverses by Unicode code point rather than by UTF-16 code unit, so
// characters outside the BMP (e.g. rare kanji such as 𠮟) are not torn into
// swapped surrogate halves. Combining sequences are not kept together.
function reverseString(str) {
    return Array.from(str).reverse().join("");
}

// TODO: Generalize this with reverse somehow
//
// If `substring` occurs in both bases[index] and readings[index], replaces
// that pair in place by three pairs, split at the FIRST occurrence in each:
//   bases:    [left, substring, right]
//   readings: [left, "",        right]
// Otherwise leaves both arrays untouched. Returns nothing.
//
// Plain string searching is used (rather than a regex with `.`) so that text
// containing line breaks is kept intact on both sides of the split.
function splitFuriganaForward(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];

    // First make sure substring is in both the base and its reading
    const baseStart = base.indexOf(substring);
    const readingStart = reading.indexOf(substring);
    if (baseStart === -1 || readingStart === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(
        index, 1,
        base.slice(0, baseStart),
        substring,
        base.slice(baseStart + substring.length)
    );
    // The matched text carries no furigana of its own, hence the empty reading
    readings.splice(
        index, 1,
        reading.slice(0, readingStart),
        "",
        reading.slice(readingStart + substring.length)
    );
}

// Hides the original ruby in #kanjiInfo and appends a new <ruby class='bound'>
// built from the parallel `bases` / `readings` arrays (one <rb>/<rt> pair per
// index), then lets prettifyEnds() tidy up the result. Returns nothing.
function displayBoundKana(bases, readings) {
    // bases/readings hold plain text taken from the page, so they have to be
    // escaped before being concatenated into markup; otherwise characters
    // such as "<" or "&" in the text would be parsed as HTML.
    const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
    const escapeHtml = (text) => String(text).replace(/[&<>"']/g, (ch) => entities[ch]);

    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");

    // Build new ruby element from the two bases and readings arrays
    let newKana = "<ruby class='bound'>";
    for (let i = 0; i < bases.length; i++) {
        newKana += "<rb>" + escapeHtml(bases[i]) + "</rb>";
        newKana += "<rt>" + escapeHtml(readings[i]) + "</rt>";
    }
    newKana += "</ruby>";

    $("#kanjiInfo").append(newKana);
    prettifyEnds();
}

// Tidies every <rb> inside #kanjiInfo, for nicer formatting:
//  - an empty <rb> is removed together with the element that follows it,
//  - a <rb> holding a single space is left alone,
//  - a leading and/or trailing misc character (see kanaRegexes.miscRe) is
//    moved out of the <rb> into its own <rb>/<rt> pair next to it.
function prettifyEnds() {
    const miscRe = kanaRegexes.miscRe;
    // miscRe is global (stateful), so rewind it before every single test
    const isMisc = (ch) => {
        miscRe.lastIndex = 0;
        return miscRe.test(ch);
    };

    $("#kanjiInfo rb").each(function () {
        const $rb = $(this);
        const initialText = $rb.text();

        // Rm empty ruby base and readings
        if (initialText === "") {
            $rb.next().remove();
            $rb.remove();
            return;
        }
        if (initialText === " ") {
            return;
        }

        // Leading misc character goes into a pair of its own in front
        const first = initialText[0];
        if (isMisc(first)) {
            $rb.text(initialText.slice(1));
            $rb.before(`<rb>${first}</rb><rt></rt>`);
        }

        // Trailing misc character goes into a pair of its own, placed after
        // the element that follows this <rb>
        const remaining = $rb.text();
        const last = remaining.slice(-1);
        if (isMisc(last)) {
            $rb.text(remaining.slice(0, remaining.length - 1));
            $rb.next().after(`<rb>${last}</rb><rt></rt>`);
        }
    });
}

// Character-class patterns shared by the functions in this script.
// All but kanjiRe are global (/g): String.match() then returns every run, but
// RegExp.test() becomes stateful, so reset lastIndex before calling test().
var kanaRegexes = {
    // A single kanji: CJK Extension A, CJK Unified and CJK Compatibility ranges
    kanjiRe: /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]/,
    // Simpler alternative, kept for reference:
    // kanjiRe: /[一-龯]+/g,

    // Runs of hiragana
    hiraganaRe: /[ぁ-ゔ]+/g,
    // Runs of katakana, including the long-vowel mark
    katakanaRe: /[ァ-ヴー]+/g,
    // Runs of ASCII letters and digits
    alphanumRe: /[A-Za-z0-9]+/g,
    // Single punctuation / separator characters
    miscRe: /[- !.?・、「」×〜&/]/g,
};

// Run once, as soon as this script has been loaded
getKanjiInfo();