/*
* wikiParserV.js
* ver. 2013-11-02
* Home: http://en.wiki.x.io/wiki/User:V111P/js/wikiParserV
*
* This is a library of useful functions, mostly for working with wiki code.
* Includes functions for removing html tags.
*
* You can use the code in this script under the
* Creative Commons Attribution 3.0 Unported License (CC-BY 3.0)
* http://creativecommons.org/licenses/by/3.0/
* If you do use it, please let me know. Thanks.
*/
mediaWiki.libs.wikiParserV = window.wikiParser = (function () {
"use strict";
// Library version number; exposed as the `version` property of the returned object.
var version = 1000;
// Shared regular expressions. Functions in this file also cache lazily
// built regexes as extra properties of this object.
var re = {
// characters that escapeForRegExp() prefixes with a backslash
escForRegExpG: /[.*+?^$|()[\]{\\^$]/g,
// probe pattern used by checkRegexSupport() (contains a negative lookahead)
testRe: /<(?!\/?(a|b)>)/g,
// any character other than an ASCII letter, digit, underscore or hyphen
nonAlphanumericAndHyphenCharsG: /[^A-Za-z0-9_-]/g,
// an html comment plus the newline before it; the same newline, if repeated
// right after the comment, is consumed as well
htmlCommentsG: /(\n)?<!--[\S\s]*?-->\1?/g // replace it with $1
};
// Namespace name arrays; filled by saveNsNames()
var locale = {}; // used in removeElements()
var $tempDiv = $('<div/>'); // used in unescapeCharEntities()
// assigned from mw.config on every formatUrl() call
var wgScriptPath;
// replacement table built on the first encodeSectionNameForUrl() call
var sectionNameUriEncodingAdditionalReplacements;
function unescapeCharEntities(str) {
return $tempDiv.html(str.replace('<', '<').replace('>', '>')).text();
}
// Builds a site-relative URL for a page name.
// article - page name, optionally followed by "#section"; spaces become underscores.
//   When both a page and a section are present, the page part is URI-encoded and
//   the section part is URI-encoded with "%" turned into "." (anchor encoding).
// noredir - if truthy, returns an index.php URL with redirect=no.
// edit - if truthy (and noredir is not), returns an index.php edit URL
//   (the section part is dropped).
// Otherwise returns '/wiki/' + the page name (with the section, if any).
function formatUrl(article, noredir, edit) {
	wgScriptPath = mw.config.get('wgScriptPath');
	article = article.replace(/ /g, '_');
	var pagePlusHash = article.match(/(.+)#(.+)/);
	if (pagePlusHash) {
		article = encodeURIComponent(pagePlusHash[1]) + '#'
			+ encodeURIComponent(pagePlusHash[2]).replace(/%/g, '.');
	}
	// Split at the first "#": everything from there on is the fragment and
	// must come after the query string, otherwise the query parameters
	// appended below would become part of the fragment.
	var hashAt = article.indexOf('#');
	var page = (hashAt === -1 ? article : article.slice(0, hashAt));
	var fragment = (hashAt === -1 ? '' : article.slice(hashAt));
	if (noredir)
		return wgScriptPath + '/index.php?title=' + page + '&redirect=no' + fragment;
	if (edit)
		return wgScriptPath + '/index.php?title=' + page + '&action=edit';
	return '/wiki/' + article;
} // formatUrl
// Encodes a section name for use as the fragment part of a URL:
// spaces become underscores, the result is URI-encoded, the characters
// that encodeURIComponent leaves alone (~ ! * ( ) ') are encoded by hand,
// colons are restored, and every remaining "%" becomes ".".
function encodeSectionNameForUrl(str) {
	// the replacement table is built once and kept at module level
	if (!sectionNameUriEncodingAdditionalReplacements) {
		sectionNameUriEncodingAdditionalReplacements = [
			{re: /~/g, newVal: '.7E'},
			{re: /!/g, newVal: '.21'},
			{re: /\*/g, newVal: '.2A'},
			{re: /\(/g, newVal: '.28'},
			{re: /\)/g, newVal: '.29'},
			{re: /\'/g, newVal: '.27'},
			{re:/%3A/g, newVal: ':'}
		];
	}
	var table = sectionNameUriEncodingAdditionalReplacements;
	var encoded = encodeURIComponent(str.replace(/ /g, '_'));
	for (var k = 0; k < table.length; k++) {
		encoded = encoded.replace(table[k].re, table[k].newVal);
	}
	return encoded.replace(/%/g, '.');
} // encodeSectionNameForUrl
// Encodes a section name into a string made only of ASCII letters, digits,
// underscores and hyphens:
// dots become "_46", the name is passed through encodeSectionNameForUrl(),
// every colon becomes "_3A" (all of them, not just the first one), and any
// other character outside the allowed set becomes "_".
function encodeSectionNameForId(str) {
	str = encodeSectionNameForUrl(str.replace(/\./g, '_46'))
		.replace(/:/g, '_3A')
		.replace(re.nonAlphanumericAndHyphenCharsG, '_');
	return str;
} // encodeSectionNameForId
// Returns str with a backslash inserted in front of every character
// matched by re.escForRegExpG, so that the result can be embedded
// in a regular expression source string.
function escapeForRegExp(str) {
	return str.replace(re.escForRegExpG, function (specialChar) {
		return '\\' + specialChar;
	});
} // escapeForRegExp
// Builds the pair of regular expressions that removeEls() uses to strip
// elements delimited by startTag ... endTag. Returns {pretreat, main}.
// main - matches an element that contains no other start or end tag
//   (plus the newline before it, captured as $1).
// pretreat - only built when startTagOfEmbededEl is given; matches an
//   embedded element (one that shares the closing tag) inside the outer
//   element and captures the preceding part of the outer element as $1.
// In startTag the sequences <<<, @@@ and >>> stand for "(", "|" and ")".
// Both expressions are global and case-insensitive.
function removeElRegExp(startTag, endTag, startTagOfEmbededEl) {
	var anyChar = '[\\S\\s]';
	var openSrc = escapeForRegExp(startTag)
		.replace(/<<</g, '(').replace(/@@@/g, '|').replace(/>>>/g, ')');
	var closeSrc = escapeForRegExp(endTag);
	var pretreatRe = null;
	if (startTagOfEmbededEl) {
		var embeddedSrc = escapeForRegExp(startTagOfEmbededEl);
		pretreatRe = new RegExp([
			'(', openSrc, '(?:(?!', closeSrc, ')', anyChar, ')*?)',
			embeddedSrc, '(?:(?!', embeddedSrc, ')', anyChar, ')*?',
			closeSrc
		].join(''), 'gi');
	}
	var mainRe = new RegExp([
		'(\\n)?', openSrc,
		'((?!', openSrc, '|', closeSrc, ')', anyChar, ')*',
		closeSrc, '\\1?'
	].join(''), 'gi');
	return {pretreat: pretreatRe, main: mainRe};
} // removeElRegExp
// Same as removeElRegExp(), but the start tag is assembled from a prefix,
// an array of alternatives and a suffix (for example '[[', the localized
// file namespace names, and ':').
// startTagOfEmbededEl - needed because, for example, files and wiki links have
// the same closing tag; so to remove files the start tag is '[[File:' and
// startTagOfEmbededEl is '[['.
function removeElRegExpStartArr(startTagPre, startTagArr, startTagPost,
		endTag, startTagOfEmbededEl) {
	// <<<, @@@ and >>> are turned into (, | and ) by removeElRegExp()
	var alternatives = '<<<' + startTagArr.join('@@@') + '>>>';
	return removeElRegExp(startTagPre + alternatives + startTagPost,
		endTag, startTagOfEmbededEl);
} // removeElRegExpStartArr
// Removes from data everything matched by the regexes in res (an object
// as returned by removeElRegExp()): first res.pretreat, if present, then
// res.main. Every match is replaced with its first capture group.
// Each regex is applied again and again until the text stops changing,
// which peels nested elements from the inside out; iterationLimit
// (default 1000) caps the number of passes per regex.
function removeEls(data, res, iterationLimit) {
	var maxPasses = iterationLimit || 1000;

	function stripUntilStable(text, regex) {
		var previous;
		var passesLeft = maxPasses; // guard against an endless loop
		do {
			passesLeft--;
			previous = text;
			text = text.replace(regex, '$1');
		} while (text != previous && passesLeft > 0);
		return text;
	}

	if (res.pretreat) {
		data = stripUntilStable(data, res.pretreat);
	}
	return stripUntilStable(data, res.main);
} // removeEls
// Saves all versions of some namespace names: reads the name -> id map
// 'wgNamespaceIds' from mw.config and (re)fills
// locale.specialNsArr (id -1), locale.fileNsArr (ids 6 and -2) and
// locale.categoryNsArr (id 14) with the matching names.
function saveNsNames() {
	function remember(list, name) {
		if ($.inArray(name, list) == -1)
			list.push(name);
	}
	locale.specialNsArr = [];
	locale.fileNsArr = [];
	locale.categoryNsArr = [];
	$.each(mw.config.get('wgNamespaceIds'), function (name, id) {
		if (id == '-1') // 'special'
			remember(locale.specialNsArr, name);
		else if (id == '6' || id == '-2') // 'file'/'image' or 'media'
			remember(locale.fileNsArr, name);
		else if (id == '14') // 'category'
			remember(locale.categoryNsArr, name);
	});
} // saveNsNames
// Replaces the characters protected by nowiki markup with HTML character
// references and drops the nowiki markup itself. Won't work in all cases.
// Two forms are handled:
// 1. An empty nowiki tag (<nowiki/>, <nowiki /> or <nowiki></nowiki>)
//    placed between two characters: one of the neighbouring characters is
//    escaped and the tag is removed.
// 2. <nowiki>...</nowiki> and <pre>...</pre> elements: the tags are removed
//    and every special character in the contents is escaped.
function escCharsForNowikiTags(data) {
	var nowikiCharTranslMap = {
		'[': '&#91;', ']': '&#93;', '{': '&#123;', '}': '&#125;',
		'<': '&lt;', '>': '&gt;', ':': '&#58;', '*': '&#42;', '#': '&#35;'
	};
	//en.wiki.x.io/wiki/Help:Nowiki#WP:NOWIKI
	// $1 - the character before the empty nowiki tag (empty at the start of
	// the text), $2 - the character after it
	var singleCharEscReG = re.singleCharEscG
		|| (re.singleCharEscG = /(.|^)<(?:nowiki ?\/|nowiki><\/nowiki)>(.)/g);
	data = data.replace(singleCharEscReG, function (m, $1, $2) {
		if ($1 == '<') return nowikiCharTranslMap['<'] + $2;
		if (nowikiCharTranslMap[$2]) return $1 + nowikiCharTranslMap[$2];
		if (nowikiCharTranslMap[$1]) return nowikiCharTranslMap[$1] + $2;
		// neither neighbour is special: just drop the (invisible) tag
		return $1 + $2;
	});
	var noWikiElReG = re.noWikiElG || (re.noWikiElG = /<(nowiki|pre)>([\S\s]*?)<\/\1>/g);
	var noWikiReplaceCharsReG = re.noWikiReplG || (re.noWikiReplG = /\[|]|\{|}|<|>|:|\*|#/g);
	data = data.replace(noWikiElReG, function (match, tagName, contents) {
		return contents.replace(noWikiReplaceCharsReG, function (specialChar) {
			return nowikiCharTranslMap[specialChar];
		});
	});
	return data;
} // escCharsForNowikiTags
// Removes whole classes of wiki/html elements from data.
// elStr - a comma+space-separated list of the element kinds to remove:
//   'comments', 'tables', 'templates', 'references', 'files', 'categories',
//   'bold/italic', 'behavior switches', 'others' (timelines).
// The kinds are always processed in the order listed above, regardless of
// their order in elStr. The regexes are built on first use and cached in re.
function removeElements(data, elStr) {
	var requested = elStr.split(', ');

	function wants(kind) {
		return $.inArray(kind, requested) > -1;
	}

	if (wants('comments')) {
		data = data.replace(re.htmlCommentsG, '$1');
	}
	if (wants('tables')) {
		if (!re.wikiTable)
			re.wikiTable = removeElRegExp('{|', '|}');
		data = removeEls(data, re.wikiTable);
		if (!re.htmlTable)
			re.htmlTable = removeElRegExp('<table', '</table>');
		data = removeEls(data, re.htmlTable);
	}
	if (wants('templates')) {
		if (!re.templates)
			re.templates = removeElRegExp('{{', '}}');
		data = removeEls(data, re.templates);
	}
	if (wants('references')) {
		if (!re.refs)
			re.refs = /<ref[^>]*?(\/>|>[\S\s]*?<\/ref\s*>)/ig;
		data = data.replace(re.refs, '');
	}
	if (wants('files')) {
		// the namespace names are needed to recognize file links
		if (!locale.fileNsArr)
			saveNsNames();
		if (!re.files)
			re.files = removeElRegExpStartArr('[[', locale.fileNsArr, ':', ']]', '[[');
		data = removeEls(data, re.files);
		if (!re.gallery)
			re.gallery = /(\n)?<gallery[^>]*>[\S\s]*?<\/gallery>\1?/gi;
		data = data.replace(re.gallery, '$1');
	}
	if (wants('categories')) {
		if (!locale.categoryNsArr)
			saveNsNames();
		if (!re.category)
			re.category = removeElRegExpStartArr('[[', locale.categoryNsArr, ':', ']]');
		data = removeEls(data, re.category);
	}
	if (wants('bold/italic')) {
		if (!re.boldItalicG)
			re.boldItalicG = /<\/?(i|b|strong|em)>|'''?|('){2,3}/gi;
		data = data.replace(re.boldItalicG, '');
	}
	if (wants('behavior switches')) {
		if (!re.behaviorSwitchesG)
			re.behaviorSwitchesG = /(\n)?__[^\s]+?__\1?/g;
		data = data.replace(re.behaviorSwitchesG, '$1');
	}
	if (wants('others')) {
		if (!re.timelineG)
			re.timelineG = /(\n)?<timeline>[\S\s]*?<\/timeline>\1?/gi;
		data = data.replace(re.timelineG, '$1');
	}
	return data;
} // removeElements;
// Turns every wikilink into plain text: [[Target|label]] becomes "label"
// and [[Target]] becomes "Target".
// All files ([[File:...]]) must be removed BEFORE calling this function.
function unlink(data) {
	var prev, cntr = 1000;
	// both regexes are cached in re under the same key they are read from
	var remAddrReG = re.remAddrG || (re.remAddrG = /\[\[[^|\]]*\|/g);
	var unlinkLinksReG = re.unlinkLinksReG || (re.unlinkLinksReG = /\[\[([^\]\[]+)\]\]/g);
	do {
		cntr--; // guard against an endless loop
		prev = data;
		// remove addresses from all links:
		data = data.replace(remAddrReG, '[[');
	} while (data != prev && cntr > 0);
	// unlink all links:
	data = data.replace(unlinkLinksReG, '$1');
	return data;
} // unlink
// Converts wiki bold (''') and italic ('') markup to <b> and <i> elements.
// Runs of exactly four apostrophes, or of six or more, are removed first.
// A bold or italic run that is not closed on its line ends at the newline;
// the newline itself is kept.
function boldAndItalicToHtml(data) {
	if (!re.boldAndItalicToHtml1) {
		// The first regex removes four, six, or more apostrophes.
		// Only the character before the run is captured (and put back);
		// the character after it is only looked at, never consumed.
		re.boldAndItalicToHtml1 = /(^|[^'])''''(?:'{2,})?(?=[^']|$)/g;
		re.boldAndItalicToHtml2 = /'''([^'\n][^\n]*?)(?:'''|(?=\n))/g;
		re.boldAndItalicToHtml3 = /''([^\n]+?)(?:''|(?=\n))/g;
	}
	return data.replace(re.boldAndItalicToHtml1, '$1')
		.replace(re.boldAndItalicToHtml2, '<b>$1</b>')
		.replace(re.boldAndItalicToHtml3, '<i>$1</i>');
} // boldAndItalicToHtml
// Returns only the text before the first section heading
// (a heading is a line that starts with one or more "=" and ends with
// the same number of "=", optionally followed by spaces).
// The result is an empty string when the text starts with a heading.
// If there are no headings at all, the whole text is returned - with the
// categories removed, if removeCategories is truthy.
function beforeTheFirstSection(data, removeCategories) {
	// (?:^|\n) lets the heading be on the very first line of the text
	var beforeFirstSectRe = re.beforeFirstSect
		|| (re.beforeFirstSect = /^([\S\s]*?)(?=(?:^|\n)(=+).+?\2[^\S\n]*(?:\n|$))/);
	var found = beforeFirstSectRe.exec(data);
	// test for a match, not for a non-empty result:
	// an empty lead section is a valid answer
	if (found)
		return found[1];
	return (removeCategories ? removeElements(data, 'categories') : data);
} // beforeTheFirstSection
// Splits wiki text into its sections. Returns an array of objects
//   {eq, level, heading, contents}
// eq - the "=" characters of the heading, level - their number,
// heading and contents - the trimmed heading text and section text.
// The first element (level 0, empty heading) holds the text before the
// first heading, as returned by beforeTheFirstSection().
// At most 1000 headings are processed.
function divideSections(data) {
	var sections = [];
	sections.push({
		eq: '',
		level: 0,
		heading: '',
		contents: beforeTheFirstSection(data, false)
	});
	var match;
	// A heading may be followed by a newline or by the end of the text,
	// so a heading on the last line is recorded too (with empty contents).
	var regex = re.divSectionsG ||
		(re.divSectionsG = /(^|\n)(=+)(.+?)\2[^\S\n]*(?=\n|$)([\S\s]*?)(?=\n(=+).+?\5[^\S\n]*(?:\n|$)|$)/g);
	// The regex is global and shared between calls: always start from the
	// beginning, even if a previous call stopped in the middle of a text.
	regex.lastIndex = 0;
	var cntr = 1000;
	while ((match = regex.exec(data)) && cntr > 0) {
		cntr--;
		sections.push({
			eq: match[2],
			level: match[2].length,
			heading: $.trim(match[3]),
			contents: $.trim(match[4])
		});
	}
	return sections;
} // divideSections
// Returns true if the regex engine handles the negative lookahead in
// re.testRe as expected: in the sample string only the "<" characters that
// do not start an <a>, </a>, <b> or </b> tag must be replaced by "&lt;".
function checkRegexSupport() {
	return ('<a><bd</e></b>'.replace(re.testRe, '&lt;') == '<a>&lt;bd&lt;/e></b>');
}
// Removes html comments and html tags from data, except for the tags in
// the comma+space-separated whiteListTagsStr list (for example 'b, i').
// Removes all the attributes from the white-listed tags.
// Unless leaveSpecialChars is truthy, the remaining special characters are
// then html-escaped:
//   every < and > that is not part of a white-listed tag -> &lt; and &gt;
//   & that is obviously not a part of a character reference -> &amp;
//   " ' ` -> &quot; &#39; &#96;
// Returns the sanitized string.
// Throws the number 1 if checkRegexSupport() fails, and the number 2 if
// one of the replacement loops has used up all of its 1000 passes.
function sanitizeHtml(data, whiteListTagsStr, leaveSpecialChars) {
	if (!checkRegexSupport())
		throw 1; // no (lookahead) regex support
	var whiteList = (whiteListTagsStr || '').split(', ').join('|');
	var nonWhiteListedTagsReG, allTagsG;
	var lessThanNotBeforeWLTagG;
	var grThanNotAndAfterWLTagG;
	var tagAttributesReG;
	var oldData, cntr;
	if (whiteList !== '') {
		// the regexes that depend on the white list are cached per list
		var byAll = re.resByWhitelist = (re.resByWhitelist || {});
		var by = byAll[whiteListTagsStr] || (byAll[whiteListTagsStr] = {});
		nonWhiteListedTagsReG = by.nonWhiteListedTagsG
			|| (by.nonWhiteListedTagsG = new RegExp('<(?!/?(' + whiteList + ')(\\b|/))[^>]*>', 'gi'));
		lessThanNotBeforeWLTagG = by.lessThanNotBeforeWLTagG
			|| (by.lessThanNotBeforeWLTagG = new RegExp('<(?!/?(' + whiteList + ')/?>)', 'gi'));
		grThanNotAndAfterWLTagG = by.grThanNotAndAfterWLTagG
			|| (by.grThanNotAndAfterWLTagG = new RegExp('(</?(' + whiteList + ')/?)?>', 'gi'));
		tagAttributesReG = re.tagAttributesG
			|| (re.tagAttributesG = /<(\/?[a-z][a-z0-9]*)[^>]*?(\/)?>/gi);
	}
	else
		allTagsG = re.allTagsG || (re.allTagsG = /<(\b|\/)[^>]*>/g);
	// Repeat until nothing changes: removing one tag or comment can
	// join the pieces around it into a new one.
	cntr = 1000;
	do {
		oldData = data;
		cntr--;
		// remove comments:
		data = data.replace(re.htmlCommentsG, '$1');
		// remove all tags except the white-listed ones
		if (whiteList !== '') {
			data = data.replace(nonWhiteListedTagsReG, '');
			// remove all attributes from the remaining tags:
			data = data.replace(tagAttributesReG, '<$1$2>');
		}
		else
			data = data.replace(allTagsG, '');
	} while (oldData != data && cntr > 0);
	if (cntr <= 0) throw 2;
	if (!leaveSpecialChars) {
		var ampNotInCharRefReG = re.ampReG || (re.ampReG = /&(?!#?[xX]?[a-zA-Z0-9]+;)/g);
		var ltReG = /</g;
		var gtReG = />/g;
		var quoteReG = /"/g;
		var aposReG = /'/g;
		var graveReG = /`/g;
		cntr = 1000;
		do {
			oldData = data;
			cntr--;
			if (whiteList !== '') {
				// html-escape all < and > except if part of a whitelisted tag
				data = data.replace(lessThanNotBeforeWLTagG, '&lt;');
				data = data.replace(grThanNotAndAfterWLTagG, function ($0, $1) {
					// $1 is set only when the > closes a white-listed tag
					return $1 ? $0 : '&gt;';
				});
			}
			else { // html-escape all < and > chars
				data = data.replace(ltReG, '&lt;').replace(gtReG, '&gt;');
			}
			// escape & to &amp; if obviously not a part of a char ref:
			data = data.replace(ampNotInCharRefReG, '&amp;');
			// escape all quotes (` is used in old IE)
			data = data.replace(quoteReG, '&quot;').replace(aposReG, '&#39;')
				.replace(graveReG, '&#96;');
		} while (oldData != data && cntr > 0);
		if (cntr <= 0) throw 2;
	}
	return data;
} // sanitizeHtml
// Looks for a known kind of segment around the selection/cursor.
// bsa - [text_before_the_selection/cursor, selection, text_after]
// segmentNames - an array of segment names or a comma+space-separated
//   string of them. Only 'wikilink' is recognized: if it is in the list,
//   the result of focusedCustomSegment() for [[...]] is returned;
//   otherwise the result is undefined.
function focusedSegment(bsa, segmentNames) {
	var names = segmentNames;
	if (typeof names != 'object')
		names = names.split(', ');
	var idx = 0;
	while (idx < names.length) {
		if (names[idx] == 'wikilink')
			return focusedCustomSegment(bsa, '[[', ']]', '', '[]<>{}');
		idx++;
	}
}
// Finds the segment (for example a wikilink) that the selection or the
// cursor is in, and returns [text_before, segment, text_after] with the
// whole segment as the middle element. Returns undefined if no segment
// is found. Incomplete implementation.
// bsa - an array with 3 elements: [text_before_the_selection/cursor, selection, text_after]
// startChars, endChars - the char(s) indicating the start/end of the segment
// otherStartChars (optional) - start chars of other segments with the same endChars,
//   needed only for some elements, for example if startChars is [[File:,
//   otherStartChars needs to be [[ because links can be embedded in file elements.
// invalidBeforePipe (optional) - a string with individual illegal characters.
//   Illegal only if before the first pipe character "|" (or anywhere, if there
//   is no pipe character). If one is found, undefined is returned.
function focusedCustomSegment(bsa, startChars, endChars, otherStartChars, invalidBeforePipe) {
	function endMatches(str, endChars) {
		return (str.slice(-endChars.length) === endChars);
	}
	function startMatches(str, startChars) {
		return (str.slice(0, startChars.length) === startChars);
	}
	var before = bsa[0];
	var selection = bsa[1]; // the selection
	var after = bsa[2];
	var spaces;
	var i;
	if (!startChars || !endChars)
		return;
	if (selection) { // there is some selected text
		spaces = selection.match(/^\s+/);
		if (spaces) { // spaces at the beginning of the selected text
			if (endMatches(before, startChars)) {
				selection = startChars + selection;
				before = before.slice(0, -startChars.length);
			}
			else {
				// move the spaces to the end of the text-before-the-selection:
				before += spaces[0];
				selection = selection.slice(spaces[0].length);
				// check for startChars at beginning of selection:
				if (!startMatches(selection, startChars))
					return;
			}
		}
		else {
			// while no (complete) startChars string at beginning of selection:
			// move a char from the end of textBefore to the beginning of selection
			var startCharsFound = false;
			for (i = 0; i <= startChars.length; i++) {
				if (startMatches(selection, startChars)) {
					startCharsFound = true;
					break;
				}
				if (before.length == 0)
					break;
				selection = before.slice(before.length - 1) + selection;
				before = before.slice(0, before.length - 1);
			}
			if (!startCharsFound)
				return;
			// TODO: check if selection contains only one outer element,
			// and the start-end chars are balanced
		}
		spaces = selection.match(/\s+$/);
		if (spaces) { // spaces at the end of the selected text
			if (startMatches(after, endChars)) {
				selection = selection + endChars;
				after = after.slice(endChars.length);
			}
			else {
				// move the spaces to the beginning of the text-after-the-selection:
				after = spaces[0] + after;
				selection = selection.slice(0, -spaces[0].length);
				if (!endMatches(selection, endChars))
					return;
			}
		}
		else {
			// while no (complete) endChars string found at end of selection:
			// move a char from the beginning of textAfter to the end of selection
			var endCharsFound = false;
			for (i = 0; i <= endChars.length; i++) {
				if (endMatches(selection, endChars)) {
					endCharsFound = true;
					break;
				}
				if (after.length == 0)
					break;
				selection = selection + after.charAt(0);
				after = after.slice(1);
			}
			if (!endCharsFound)
				return;
		}
	} // if (selection)
	else { // no text selected
		var text = before + after;
		// TODO: add a loop to allow the cursor to be after an embedded element
		var startCharsAt = text.lastIndexOf(startChars, before.length + startChars.length - 3);
		if (startCharsAt == -1)
			return;
		var closing = startCharsAt;
		var opening = startCharsAt;
		var openingOther;
		var segmentFound = false;
		// Each pass moves on to the next closing and the next opening
		// (of either kind); the segment ends at the first closing that
		// comes before the next opening. At most 10 passes are made.
		i = 0;
		while (i++ < 10) {
			closing = text.indexOf(endChars, closing + 1);
			if (closing == -1) {
				return;
			}
			if (otherStartChars) {
				// search after the previous opening, or the same one
				// would be found over and over again
				openingOther = text.indexOf(otherStartChars, opening + 1);
			}
			opening = text.indexOf(startChars, opening + 1);
			if (opening == -1)
				opening = text.length;
			if (otherStartChars) {
				if (openingOther > -1)
					opening = (openingOther < opening ? openingOther : opening);
			}
			if (closing < opening) {
				if (closing < before.length - endChars.length) {
					return; // the segment ends before the cursor
				}
				// the segment includes its closing chars,
				// which may differ in length from the opening ones
				selection = text.slice(startCharsAt, closing + endChars.length);
				before = text.slice(0, startCharsAt);
				after = text.slice(closing + endChars.length);
				segmentFound = true;
				break;
			}
		}
		if (!segmentFound)
			return; // too deeply nested to resolve: no segment to report
	}
	if (invalidBeforePipe) {
		var invalidEscForRe = escapeForRegExp(invalidBeforePipe);
		var beforePipe = selection.slice(startChars.length, -endChars.length).match(/[^|]*/)[0];
		if (beforePipe.match('[' + invalidEscForRe + ']'))
			return;
	}
	return [before, selection, after];
} // focusedCustomSegment
// The public interface of the library: the object returned here is what
// the enclosing function expression evaluates to. Each property refers to
// the function (or value) of the same name defined above.
return {
version: version,
unescapeCharEntities: unescapeCharEntities,
formatUrl: formatUrl,
encodeSectionNameForUrl: encodeSectionNameForUrl,
encodeSectionNameForId: encodeSectionNameForId,
checkRegexSupport: checkRegexSupport,
escCharsForNowikiTags: escCharsForNowikiTags,
removeElRegExp: removeElRegExp,
removeElRegExpStartArr: removeElRegExpStartArr,
removeElements: removeElements,
unlink: unlink,
sanitizeHtml: sanitizeHtml,
boldAndItalicToHtml: boldAndItalicToHtml,
beforeTheFirstSection: beforeTheFirstSection,
divideSections: divideSections,
focusedCustomSegment: focusedCustomSegment, // incomplete implementation
focusedSegment: focusedSegment // works only for wikilinks right now
};
})();