/*==============================================================================

                             HTML2XHTML Converter 1.5
                             ========================
                       Copyright (c) 2004-2006 Vyacheslav Smolin


Author:
-------
Vyacheslav Smolin (http://www.richarea.com, http://html2xhtml.richarea.com,
re@richarea.com)

About the script:
-----------------
HTML2XHTML Converter (H2X) generates a well formed XHTML string from a HTML DOM
object.

Requirements:
-------------
H2X works in  MS IE 5.0 for Windows or above,  in Netscape 7.1,  Mozilla 1.3 or
above. It should work in all Mozilla based browsers.

Usage:
------
Please see description of function get_xhtml below.

Demo:
-----
http://html2xhtml.richarea.com/, http://www.richarea.com/demo/

License:
--------
Free for non-commercial using. Please contact author for commercial licenses.


==============================================================================*/

/*
====================================================================================================================
#	Date			Developer	Brief Description of Change and Reason for Change
====================================================================================================================
1.	14-Mar-2007		pinhow		For text inside "script" tag, change "&", "<", ">" to HTML code.
								So that it becomes XML text only.
2.	16-Mar-2007		pinhow		Fixed bug. If the tag name starts with "/", skip it to prevent error in IE.
								E.g. In IE, any "</a>" will be converted to "</a />" which is wrong.
3.	15-Jun-2007		pinhow		Fixed bug. Don't use RegExp.compile, Safari browser does not support. Assign 
								directly in its constructor.
4.	19-Jul-2007		pinhow		Changed fix_entities(text) function: Convert HTML code into HTML number instead of
								the reverse way. E.g. Convert "&pound;" to "&#163;", convert "&euro;" to "&#8364;".
===================================================================================================================
*/


// Pipe-delimited list of lower case tag names; get_xhtml emits a \n before
// the opening tag of every element named here.
var need_nl_before = '|div|p|table|tbody|tr|td|th|title|head|body|script|comment|li|meta|h1|h2|h3|h4|h5|h6|hr|ul|ol|option|link|';
// Pipe-delimited list of tag names meant to get a \n after their opening tag
// (get_xhtml does not currently emit that newline).
var need_nl_after = '|html|head|body|p|th|style|';

// Matches a complete "<!--...-->" comment and captures its inner text in
// group 1. "[a]|[^a]" stands for "any character, line breaks included".
// #3: the pattern is given at construction time because Safari does not
// support RegExp.compile.
var re_comment = /^<!--(([a]|[^a])*)-->$/;

// Matches a hyphen that is the very last character of a string (#3, see above).
var re_hyphen = /-$/;


// Convert the child nodes of a dom node to a well formed xhtml string.
// The tag of the node itself is not emitted, only its content.
//
// Call: get_xhtml(node);
//       get_xhtml(node, lang, encoding) -- to convert whole page
// other parameters are for inner usage (recursion) and should be omitted
//
// Parameters:
// node - dom node to convert
// lang - document lang (need it if whole page converted); written to the
//        lang attribute of <html> and to every xml:lang attribute
// encoding - document charset (need it if whole page converted); written to
//        the xml declaration that is emitted in front of <html>
// need_nl - if true, add \n before a tag if it is in list need_nl_before
// inside_pre - if true, do not change content, as it is inside a <pre>
//
// Returns the generated xhtml string. If need_nl is not set and a <body>
// child is met (html fragment mode), <head> and <body> tags are removed
// from the result.
function get_xhtml(node, lang, encoding, need_nl, inside_pre) {

var i;
var j; //attribute loop index; declared here so it is no longer an implicit global
var text = '';
var children = node.childNodes;
var child_length = children.length;
var tag_name;
var do_nl = need_nl?true:false;
var page_mode = true;
//document.all is used as the "IE dom" flag; the typeof guard keeps a missing
//document object from raising an error
var is_ie_dom = (typeof document != 'undefined' && document.all)?true:false;

	for (i=0;i<child_length;i++) {
		var child = children[i];

		//to prevent adding parts of html code twice in IE (thanks to Jorn Sjostrom)
		if (child.parentNode && String(node.tagName).toLowerCase() != String(child.parentNode.tagName).toLowerCase()) continue;

		switch (child.nodeType) {

			case 1: { //ELEMENT_NODE
				tag_name = String(child.tagName).toLowerCase();

				if (tag_name == '') break;

				//<meta name="generator"> is dropped from the output
				if (tag_name == 'meta') {
					var meta_name = String(child.name).toLowerCase();
					if (meta_name == 'generator') break;
				}

				if (!need_nl && tag_name == 'body') { //html fragment mode
					page_mode = false;
				}

				if (tag_name == '!') { //COMMENT_NODE in IE 5.0/5.5
					//get comment inner text
					var parts = re_comment.exec(child.text);

					if (parts) {
						//fix_comment takes care of hyphens in the comment text
						text += fix_comment(parts[1]);
					}

					//the comment is complete now; without this break it
					//would also be written out as a bogus "<! />" element
					break;
				}

				// #2 Added by pinhow 16-Mar-2007
				// If tag_name start with "/", skip it (IE exposes stray
				// closing tags as elements, e.g. "</a>" would become "</a />").
				if (tag_name.charAt(0) == '/') continue;

				if (tag_name == 'html'){
					//note: assignment, so anything collected before <html> is replaced
					text = '<?xml version="1.0" encoding="'+encoding+'"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n';
				}

				//insert \n to make code more neat
				if (need_nl_before.indexOf('|'+tag_name+'|') != -1) {
					if ((do_nl || text != '') && !inside_pre) text += '\n';
						else do_nl = true;
				}

				text += '<'+tag_name;

				//add attributes
				var attr = child.attributes;
				var attr_length = attr.length;
				var attr_value;

				var attr_lang = false;
				var attr_xml_lang = false;
				var attr_xmlns = false;

				var is_alt_attr = false;


				for (j=0;j<attr_length;j++) {
					var attr_name = attr[j].nodeName.toLowerCase();

					//skip attributes that are not set in the markup, except
					//the few that are still wanted in that case
					if (!attr[j].specified &&
						attr_name != 'selected' &&
						attr_name != 'style' &&
						attr_name != 'value') continue; //IE 5.0

					if (attr_name == 'selected' &&
						!child.selected ||
						attr_name == 'style' && //IE 5.0
						child.style.cssText == '') continue;

					//skip Mozilla editor artifacts
					if (attr_name == '_moz_dirty' ||
						attr_name == '_moz_resizing' ||
						tag_name == 'br' && attr_name == 'type' &&
						child.getAttribute('type') == '_moz') continue;

					var valid_attr = true;

					switch (attr_name) {
						case "style" :
							attr_value = child.style.cssText;
							break;
						case "class" :
							attr_value = child.className;
							break;
						case "http-equiv":
							attr_value = child.httpEquiv;
							break;
						case "noshade": //this set of choices will extend
						case "checked":
						case "selected":
						case "multiple":
						case "nowrap":
						case "disabled":
							//boolean attributes are written as name="name"
							attr_value = attr_name;
							break;
						case "name":
							attr_value = child.name;
							break;
						case "for":
							attr_value = child.htmlFor;
							break;
						default:
							try {
								attr_value = child.getAttribute(attr_name, 2);
							} catch (e) {
								//value can not be read, so the attribute is left out
								valid_attr = false;
							}
					}

					//html tag attribs
					if (attr_name == 'lang' && tag_name == 'html') {
						attr_lang = true;
						attr_value = lang;
					}
					if (attr_name == 'xml:lang') {
						attr_xml_lang = true;
						attr_value = lang;
					}
					if (attr_name == 'xmlns') attr_xmlns = true;

					if (tag_name == 'object' && attr_name == 'src' &&
						is_ie_dom) {

						//fix src attribute in IE
						attr_value = fix_object_src(child.outerHTML);
					}

					if (valid_attr) {
						//value attribute set to "0" is not handled correctly in Mozilla
						if (!(tag_name == 'li' && attr_name == 'value')) {
							text += ' '+attr_name+'="'+fix_attribute(attr_value)+'"';
						}
					}

					if (attr_name == 'alt') is_alt_attr = true;

				}

				//alt attribute is added to every <img> that lacks it
				if (tag_name == 'img' && !is_alt_attr) {
					text += ' alt=""';
				}

				if (tag_name == 'html') {
					if (!attr_lang) text += ' lang="'+lang+'"';
					if (!attr_xml_lang) text += ' xml:lang="'+lang+'"';
					if (!attr_xmlns) text += ' xmlns="http://www.w3.org/1999/xhtml"';
				}

				if (child.canHaveChildren || child.hasChildNodes()){
					text += '>';
					text += get_xhtml(child, lang, encoding, true,
						(inside_pre || tag_name == 'pre')?true:false);
					text += '</'+tag_name+'>';
				} else {

					//these tags must have closing tags
					//'a' included as otherwise Mozilla extends <a /> links
					//on content coming after the link, that is wrong
					if (tag_name == 'style' || tag_name == 'title' ||
						tag_name == 'script' || tag_name == 'textarea' ||
						tag_name == 'a') {

						text += '>';
						var inner_text;
						if (tag_name == 'script') {
							inner_text = child.text;
							// #1 Added by pinhow 14-Mar-2007
							// "&" goes first so the entities added next are not escaped again
							inner_text = inner_text.replace(/&/g, "&amp;");
							inner_text = inner_text.replace(/</g, "&lt;");
							inner_text = inner_text.replace(/>/g, "&gt;");
						}else inner_text = child.innerHTML;

						if (tag_name == 'style') {
							//collapse runs of line breaks
							inner_text = String(inner_text).replace(/[\n]+/g,'\n');
						}

						text += inner_text+'</'+tag_name+'>';

					} else {
						text += ' />';
					}
				}

				break;
			}
			case 3: { //TEXT_NODE
				if (!inside_pre) { //do not change text inside <pre> tag
					//a text node that is a single line break is dropped
					if (child.nodeValue != '\n') {
						text += fix_entities(fix_text(child.nodeValue));
					}
				} else text += child.nodeValue;
				break;
			}
			case 8: { //COMMENT_NODE
				text += fix_comment(child.nodeValue);
				break;
			}
			default:
				break;
		}
	}

	if (!need_nl && !page_mode) { //delete head and body tags from html fragment
			text = text.replace(/<\/?head>[\n]*/gi, "");
			text = text.replace(/<head \/>[\n]*/gi, "");
			text = text.replace(/<\/?body>[\n]*/gi, "");
	}

	return text;
}

//Build a "<!--...-->" comment from its inner text.
//Every "--" in the text is replaced by "__", and one space is appended when
//the text ends with a hyphen, so the text never runs into the closing "-->".
function fix_comment(text){

	var body = text.replace(/--/g, "__");

	//last char must not be a hyphen
	var padding = re_hyphen.exec(body) ? " " : "";

	return "<!--" + body + padding + "-->";
}

//Escape the content of a text node.
//The input is converted to a string, then the substitutions below are applied
//one after another; their order matters.
function fix_text(text) {
	var steps = [
		//hide existing &lt; and &gt; so the "&" step does not escape their "&"
		[/\&lt;/g, "#h2x_lt"],
		[/\&gt;/g, "#h2x_gt"],
		//collapse runs of line breaks into one
		[/\n{2,}/g, "\n"],
		//convert &, < and > to the corresponding entities ("&" first)
		[/\&/g, "&amp;"],
		[/</g, "&lt;"],
		[/>/g, "&gt;"],
		//non-breaking space
		[/\u00A0/g, "&nbsp;"],
		//bring the hidden &lt; and &gt; back
		[/#h2x_lt/g, "&lt;"],
		[/#h2x_gt/g, "&gt;"]
	];
	var result = String(text);
	var k;

	for (k = 0; k < steps.length; k++) {
		result = result.replace(steps[k][0], steps[k][1]);
	}

	return result;
}

//Escape an attribute value so it can be written between double quotes.
//The input is converted to a string; &, <, > and " become entities, while
//&lt; and &gt; that are already present are left as they are.
function fix_attribute(text) {
	var value = String(text);

	//hide existing &lt; and &gt; so the "&" step does not escape their "&"
	value = value.replace(/\&lt;/g, "#h2x_lt");
	value = value.replace(/\&gt;/g, "#h2x_gt");

	//convert &, <, > and " to the corresponding entities ("&" first)
	value = value.replace(/\&/g, "&amp;");
	value = value.replace(/</g, "&lt;");
	value = value.replace(/>/g, "&gt;");
	value = value.replace(/\"/g, "&quot;");

	//bring the hidden &lt; and &gt; back
	value = value.replace(/#h2x_lt/g, "&lt;");
	value = value.replace(/#h2x_gt/g, "&gt;");

	return value;
}

//Read the value of the src attribute from the markup of an <object> tag
//(get_xhtml passes the outerHTML of the element, as the src attribute of a
//flash object is set to null by default in IE).
//
//Parameters:
//text - markup to search; it is converted to a string first, so null or
//       undefined are accepted
//
//Returns the src value of the first <object ...> tag found: the text between
//the quotes for src="..." and src='...', or the text up to the next
//whitespace for an unquoted value. Returns '' if there is no <object> tag
//with attributes or the tag has no src attribute.
function fix_object_src(text) {
	//work on the coerced string so that non-string input can not throw
	var html = String(text);

	var obj_tag_parts = html.match(/<object\s([^>]+)>/i);
	if (!obj_tag_parts) return '';

	//"src" must start an attribute (start of the list or after whitespace),
	//otherwise attributes such as datasrc="..." would be taken for it
	var src_value = obj_tag_parts[1].match(/(?:^|\s)src\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'][^\s]*))/i);
	if (!src_value) return '';

	//exactly one of the three groups took part in the match
	return src_value[1] || src_value[2] || src_value[3] || '';
}

//Named character entities and their character codes (decimal).
//amp, lt, gt and quot are left out on purpose: they are handled by fix_text
//and fix_attribute.
var h2x_entity_codes = {

	"nbsp"		: 160,
	"euro" 		: 8364,
	"fnof" 		: 402,
	"permil" 	: 8240,
	"Scaron" 	: 352,
	"OElig"		: 338,
	"trade"		: 8482,
	"scaron"	: 353,
	"oelig"		: 339,
	"Yuml"		: 376,
	"cent"		: 162,
	"pound"		: 163,
	"curren"	: 164,
	"yen"		: 165,
	"brvbar"	: 166,
	"sect"		: 167,
	"uml"		: 168,
	"copy"		: 169,

	"ordf"		: 170,
	"laquo"		: 171,
	"not"		: 172,
	"shy"		: 173,
	"reg"		: 174,
	"macr"		: 175,
	"deg"		: 176,
	"plusmn"	: 177,
	"sup2"		: 178,
	"sup3"		: 179,
	"acute"		: 180,
	"micro"		: 181,
	"para"		: 182,
	"middot"	: 183,
	"cedil"		: 184,
	"sup1"		: 185,
	"ordm"		: 186,
	"raquo"		: 187,
	"frac14"	: 188,
	"frac12"	: 189,

	"frac34"	: 190,
	"iquest"	: 191,
	"Agrave"	: 192,
	"Aacute"	: 193,
	"Acirc"		: 194,
	"Atilde"	: 195,
	"Auml"		: 196,
	"Aring"		: 197,
	"AElig"		: 198,
	"Ccedil"	: 199,
	"Egrave"	: 200,

	"Eacute"	: 201,
	"Ecirc"		: 202,
	"Euml"		: 203,
	"Igrave"	: 204,
	"Iacute"	: 205,
	"Icirc"		: 206,
	"Iuml"		: 207,
	"ETH"		: 208,
	"Ntilde"	: 209,

	"Ograve"	: 210,
	"Oacute"	: 211,
	"Ocirc"		: 212,
	"Otilde"	: 213,
	"Ouml"		: 214,
	"times"		: 215,
	"Oslash"	: 216,
	"Ugrave"	: 217,
	"Uacute"	: 218,
	"Ucirc"		: 219,
	"Uuml"		: 220,
	"Yacute"	: 221,
	"THORN"		: 222,
	"szlig"		: 223,
	"agrave"	: 224,
	"aacute"	: 225,
	"acirc"		: 226,
	"atilde"	: 227,
	"auml"		: 228,
	"aring"		: 229,

	"aelig"		: 230,
	"ccedil"	: 231,
	"egrave"	: 232,
	"eacute"	: 233,
	"ecirc"		: 234,
	"euml"		: 235,
	"igrave"	: 236,
	"iacute"	: 237,
	"icirc"		: 238,
	"iuml"		: 239,
	"eth"		: 240,
	"ntilde"	: 241,
	"ograve"	: 242,
	"oacute"	: 243,
	"ocirc"		: 244,
	"otilde"	: 245,
	"ouml"		: 246,
	"divide"	: 247,
	"oslash"	: 248,
	"ugrave"	: 249,
	"uacute"	: 250,
	"ucirc"		: 251,
	"uuml"		: 252,
	"yacute"	: 253,
	"thorn"		: 254,
	"yuml"		: 255,


	"Alpha"		: 913,
	"Beta"		: 914,
	"Gamma"		: 915,
	"Delta"		: 916,
	"Epsilon"	: 917,
	"Zeta"		: 918,
	"Eta"		: 919,
	"Theta"		: 920,
	"Iota"		: 921,
	"Kappa"		: 922,
	"Lambda"	: 923,
	"Mu"		: 924,
	"Nu"		: 925,
	"Xi"		: 926,
	"Omicron"	: 927,
	"Pi"		: 928,
	"Rho"		: 929,

	"Sigma"		: 931,
	"Tau"		: 932,
	"Upsilon"	: 933,
	"Phi"		: 934,
	"Chi"		: 935,
	"Psi"		: 936,
	"Omega"		: 937,

	"there4"	: 8756,
	"perp"		: 8869,

	"alpha"		: 945,
	"beta"		: 946,
	"gamma"		: 947,
	"delta"		: 948,
	"epsilon"	: 949,
	"zeta"		: 950,
	"eta"		: 951,
	"theta"		: 952,
	"iota"		: 953,
	"kappa"		: 954,
	"lambda"	: 955,
	"mu"		: 956,
	"nu"		: 957,
	"xi"		: 958,	//was 968, which is psi
	"omicron"	: 959,	//was 969, which is omega
	"pi"		: 960,
	"rho"		: 961,
	"sigmaf"	: 962,
	"sigma"		: 963,
	"tau"		: 964,
	"upsilon"	: 965,
	"phi"		: 966,
	"chi"		: 967,
	"psi"		: 968,
	"omega"		: 969,

	"oline"		: 8254,
	"le"		: 8804,
	"frasl"		: 8260,
	"infin"		: 8734,
	"int"		: 8747,
	"clubs"		: 9827,
	"diams"		: 9830,
	"hearts"	: 9829,
	"spades"	: 9824,
	"harr"		: 8596,
	"larr"		: 8592,
	"rarr"		: 8594,
	"uarr"		: 8593,
	"darr"		: 8595,
	"ldquo"		: 8220,
	"rdquo"		: 8221,
	"bdquo"		: 8222,
	"ge"		: 8805,
	"prop"		: 8733,
	"part"		: 8706,
	"bull"		: 8226,
	"ne"		: 8800,
	"equiv"		: 8801,
	"asymp"		: 8776,
	"hellip"	: 8230,
	"mdash"		: 8212,
	"cap"		: 8745,
	"cup"		: 8746,
	"sup"		: 8835,
	"supe"		: 8839,
	"sub"		: 8834,
	"sube"		: 8838,
	"isin"		: 8712,
	"ni"		: 8715,
	"ang"		: 8736,
	"nabla"		: 8711,
	"prod"		: 8719,
	"radic"		: 8730,
	"and"		: 8743,
	"or"		: 8744,
	"hArr"		: 8660,
	"rArr"		: 8658,
	"loz"		: 9674,
	"sum"		: 8721,

	"forall"	: 8704,
	"exist"		: 8707,
	"lsquo"		: 8216,
	"rsquo"		: 8217,
	"iexcl"		: 161,

// other entities
	"thetasym"	: 977,
	"upsih"		: 978,
	"piv"		: 982,
	"prime"		: 8242,
	"Prime"		: 8243,
	"weierp"	: 8472,
	"image"		: 8465,
	"real"		: 8476,
	"alefsym"	: 8501,
	"crarr"		: 8629,
	"lArr"		: 8656,
	"uArr"		: 8657,
	"dArr"		: 8659,
	"empty"		: 8709,
	"notin"		: 8713,
	"lowast"	: 8727,
	"sim"		: 8764,
	"cong"		: 8773,
	"nsub"		: 8836,
	"oplus"		: 8853,
	"otimes"	: 8855,
	"sdot"		: 8901,
	"lceil"		: 8968,
	"rceil"		: 8969,
	"lfloor"	: 8970,
	"rfloor"	: 8971,
	"lang"		: 9001,
	"rang"		: 9002,
	"circ"		: 710,
	"tilde"		: 732,
	"ensp"		: 8194,
	"emsp"		: 8195,
	"thinsp"	: 8201,
	"zwnj"		: 8204,
	"zwj"		: 8205,
	"lrm"		: 8206,
	"rlm"		: 8207,
	"ndash"		: 8211,
	"sbquo"		: 8218,
	"dagger"	: 8224,
	"Dagger"	: 8225,
	"lsaquo"	: 8249,
	"rsaquo"	: 8250

};

//Set of all character codes listed in h2x_entity_codes (code -> true),
//built once when the script is loaded.
var h2x_entity_code_set = (function () {
	var codes = {};
	var name;

	for (name in h2x_entity_codes) {
		if (Object.prototype.hasOwnProperty.call(h2x_entity_codes, name)) {
			codes[h2x_entity_codes[name]] = true;
		}
	}

	return codes;
})();

//fix entities, eg &euro;
//Replaces entities by numeric character references (#4), so that the result
//does not depend on entity names that are unknown to an XML parser:
// - a named reference from the table above, e.g. "&pound;" or "&nbsp;",
//   becomes "&#163;" / "&#160;";
// - a character whose code is in the table, e.g. the pound sign itself,
//   becomes "&#163;" as well.
//Everything else, including &amp;, &lt;, &gt;, &quot; and references with an
//unknown name, is passed through unchanged.
//
//Parameters:
//text - text to fix; it is converted to a string first
//
//Returns the fixed string.
function fix_entities(text) {
	var source = String(text);
	var new_text = '';
	var i;

	//named references -> numeric references
	source = source.replace(/&([A-Za-z][A-Za-z0-9]*);/g, function (reference, name) {
		//own properties only: "&constructor;" and the like must not match
		if (Object.prototype.hasOwnProperty.call(h2x_entity_codes, name)) {
			return "&#" + h2x_entity_codes[name] + ";";
		}
		return reference;
	});

	//listed characters -> numeric references
	for (i = 0; i < source.length; i++) {
		var c_code = source.charCodeAt(i);
		if (h2x_entity_code_set[c_code] === true) {
			new_text += "&#" + c_code + ";"; // 4
		} else new_text += source.charAt(i);
	}

	return new_text;
}

