偶尔会在数据中看到诸如 &#39; 这样的字符,特征如下:
以&#开头,中间是一串数字,以;结尾
以&开头,中间一串字符,以;结尾
比如最常见的 &nbsp; 或者等价的 &#160;
浏览器遇到这些转义符,会转义回来,但如何通过代码识别?org.apache.commons.lang.StringEscapeUtils.unescapeHtml提供了很好的说明
遇到上面的第一种情况,中间是数字的,直接将数字(unicode)转为char
遇到第二种情况,中间是字符,只能查映射表了,从映射表中找到字符对应的数字再转换为char。看看代码就一目了然了
看看HTML40如何定义的
// Builds the HTML40 entity table once, when the class is initialized:
// a fresh Entities instance is created and then populated by
// fillWithHtml40Entities.
static {
    HTML40 = new Entities();
    fillWithHtml40Entities(HTML40);
}
staticvoidfillWithHtml40Entities(Entitiesentities){
entities.addEntities(BASIC_ARRAY);
entities.addEntities(ISO8859_1_ARRAY);
entities.addEntities(HTML40_ARRAY);
}
再看看BASIC_ARRAY、ISO8859_1_ARRAY、HTML40_ARRAY分别是什么
BASIC_ARRAY
/**
 * The four basic markup entities.
 *
 * <p>Each row is {@code {entity name, decimal character code}}; the name is the
 * text that appears between {@code &} and {@code ;} in the escaped form.
 */
private static final String[][] BASIC_ARRAY = {
    {"quot", "34"}, // " - double-quote
    {"amp", "38"}, // & - ampersand
    {"lt", "60"}, // < - less-than
    {"gt", "62"}, // > - greater-than
};
ISO8859_1_ARRAY
/**
 * Named entities for the character codes 160 through 255.
 *
 * <p>Each row is {@code {entity name, decimal character code}}. Rows are listed
 * in ascending code order with no gaps.
 */
static final String[][] ISO8859_1_ARRAY = {
    {"nbsp", "160"}, // non-breaking space
    {"iexcl", "161"}, // inverted exclamation mark
    {"cent", "162"}, // cent sign
    {"pound", "163"}, // pound sign
    {"curren", "164"}, // currency sign
    {"yen", "165"}, // yen sign = yuan sign
    {"brvbar", "166"}, // broken bar = broken vertical bar
    {"sect", "167"}, // section sign
    {"uml", "168"}, // diaeresis = spacing diaeresis
    {"copy", "169"}, // U+00A9 - copyright sign
    {"ordf", "170"}, // feminine ordinal indicator
    {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
    {"not", "172"}, // not sign
    {"shy", "173"}, // soft hyphen = discretionary hyphen
    {"reg", "174"}, // U+00AE - registered trademark sign
    {"macr", "175"}, // macron = spacing macron = overline = APL overbar
    {"deg", "176"}, // degree sign
    {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
    {"sup2", "178"}, // superscript two = superscript digit two = squared
    {"sup3", "179"}, // superscript three = superscript digit three = cubed
    {"acute", "180"}, // acute accent = spacing acute
    {"micro", "181"}, // micro sign
    {"para", "182"}, // pilcrow sign = paragraph sign
    {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
    {"cedil", "184"}, // cedilla = spacing cedilla
    {"sup1", "185"}, // superscript one = superscript digit one
    {"ordm", "186"}, // masculine ordinal indicator
    {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
    {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
    {"frac12", "189"}, // vulgar fraction one half = fraction one half
    {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
    {"iquest", "191"}, // inverted question mark = turned question mark
    {"Agrave", "192"}, // U+00C0 - uppercase A, grave accent
    {"Aacute", "193"}, // U+00C1 - uppercase A, acute accent
    {"Acirc", "194"}, // U+00C2 - uppercase A, circumflex accent
    {"Atilde", "195"}, // U+00C3 - uppercase A, tilde
    {"Auml", "196"}, // U+00C4 - uppercase A, umlaut
    {"Aring", "197"}, // U+00C5 - uppercase A, ring
    {"AElig", "198"}, // U+00C6 - uppercase AE
    {"Ccedil", "199"}, // U+00C7 - uppercase C, cedilla
    {"Egrave", "200"}, // U+00C8 - uppercase E, grave accent
    {"Eacute", "201"}, // U+00C9 - uppercase E, acute accent
    {"Ecirc", "202"}, // U+00CA - uppercase E, circumflex accent
    {"Euml", "203"}, // U+00CB - uppercase E, umlaut
    {"Igrave", "204"}, // U+00CC - uppercase I, grave accent
    {"Iacute", "205"}, // U+00CD - uppercase I, acute accent
    {"Icirc", "206"}, // U+00CE - uppercase I, circumflex accent
    {"Iuml", "207"}, // U+00CF - uppercase I, umlaut
    {"ETH", "208"}, // U+00D0 - uppercase Eth, Icelandic
    {"Ntilde", "209"}, // U+00D1 - uppercase N, tilde
    {"Ograve", "210"}, // U+00D2 - uppercase O, grave accent
    {"Oacute", "211"}, // U+00D3 - uppercase O, acute accent
    {"Ocirc", "212"}, // U+00D4 - uppercase O, circumflex accent
    {"Otilde", "213"}, // U+00D5 - uppercase O, tilde
    {"Ouml", "214"}, // U+00D6 - uppercase O, umlaut
    {"times", "215"}, // multiplication sign
    {"Oslash", "216"}, // U+00D8 - uppercase O, slash
    {"Ugrave", "217"}, // U+00D9 - uppercase U, grave accent
    {"Uacute", "218"}, // U+00DA - uppercase U, acute accent
    {"Ucirc", "219"}, // U+00DB - uppercase U, circumflex accent
    {"Uuml", "220"}, // U+00DC - uppercase U, umlaut
    {"Yacute", "221"}, // U+00DD - uppercase Y, acute accent
    {"THORN", "222"}, // U+00DE - uppercase THORN, Icelandic
    {"szlig", "223"}, // U+00DF - lowercase sharps, German
    {"agrave", "224"}, // U+00E0 - lowercase a, grave accent
    {"aacute", "225"}, // U+00E1 - lowercase a, acute accent
    {"acirc", "226"}, // U+00E2 - lowercase a, circumflex accent
    {"atilde", "227"}, // U+00E3 - lowercase a, tilde
    {"auml", "228"}, // U+00E4 - lowercase a, umlaut
    {"aring", "229"}, // U+00E5 - lowercase a, ring
    {"aelig", "230"}, // U+00E6 - lowercase ae
    {"ccedil", "231"}, // U+00E7 - lowercase c, cedilla
    {"egrave", "232"}, // U+00E8 - lowercase e, grave accent
    {"eacute", "233"}, // U+00E9 - lowercase e, acute accent
    {"ecirc", "234"}, // U+00EA - lowercase e, circumflex accent
    {"euml", "235"}, // U+00EB - lowercase e, umlaut
    {"igrave", "236"}, // U+00EC - lowercase i, grave accent
    {"iacute", "237"}, // U+00ED - lowercase i, acute accent
    {"icirc", "238"}, // U+00EE - lowercase i, circumflex accent
    {"iuml", "239"}, // U+00EF - lowercase i, umlaut
    {"eth", "240"}, // U+00F0 - lowercase eth, Icelandic
    {"ntilde", "241"}, // U+00F1 - lowercase n, tilde
    {"ograve", "242"}, // U+00F2 - lowercase o, grave accent
    {"oacute", "243"}, // U+00F3 - lowercase o, acute accent
    {"ocirc", "244"}, // U+00F4 - lowercase o, circumflex accent
    {"otilde", "245"}, // U+00F5 - lowercase o, tilde
    {"ouml", "246"}, // U+00F6 - lowercase o, umlaut
    {"divide", "247"}, // division sign
    {"oslash", "248"}, // U+00F8 - lowercase o, slash
    {"ugrave", "249"}, // U+00F9 - lowercase u, grave accent
    {"uacute", "250"}, // U+00FA - lowercase u, acute accent
    {"ucirc", "251"}, // U+00FB - lowercase u, circumflex accent
    {"uuml", "252"}, // U+00FC - lowercase u, umlaut
    {"yacute", "253"}, // U+00FD - lowercase y, acute accent
    {"thorn", "254"}, // U+00FE - lowercase thorn, Icelandic
    {"yuml", "255"}, // U+00FF - lowercase y, umlaut
};
HTML40_ARRAY
/**
 * Additional named entities beyond the basic and 160-255 tables: Greek letters,
 * punctuation, arrows, mathematical operators, technical and miscellaneous
 * symbols, and extended Latin letters.
 *
 * <p>Each row is {@code {entity name, decimal character code}}. The trailing
 * comments carry the matching {@code U+XXXX} code point and entity-set name.
 */
static final String[][] HTML40_ARRAY = {
    // <!-- Latin Extended-B -->
    {"fnof", "402"}, // latin small f with hook = function = florin, U+0192 ISOtech -->
    // <!-- Greek -->
    {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
    {"Beta", "914"}, // greek capital letter beta, U+0392 -->
    {"Gamma", "915"}, // greek capital letter gamma, U+0393 ISOgrk3 -->
    {"Delta", "916"}, // greek capital letter delta, U+0394 ISOgrk3 -->
    {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
    {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
    {"Eta", "919"}, // greek capital letter eta, U+0397 -->
    {"Theta", "920"}, // greek capital letter theta, U+0398 ISOgrk3 -->
    {"Iota", "921"}, // greek capital letter iota, U+0399 -->
    {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
    {"Lambda", "923"}, // greek capital letter lambda, U+039B ISOgrk3 -->
    {"Mu", "924"}, // greek capital letter mu, U+039C -->
    {"Nu", "925"}, // greek capital letter nu, U+039D -->
    {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
    {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
    {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
    {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
    // <!-- there is no Sigmaf, and no U+03A2 character either -->
    {"Sigma", "931"}, // greek capital letter sigma, U+03A3 ISOgrk3 -->
    {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
    {"Upsilon", "933"}, // greek capital letter upsilon, U+03A5 ISOgrk3 -->
    {"Phi", "934"}, // greek capital letter phi, U+03A6 ISOgrk3 -->
    {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
    {"Psi", "936"}, // greek capital letter psi, U+03A8 ISOgrk3 -->
    {"Omega", "937"}, // greek capital letter omega, U+03A9 ISOgrk3 -->
    {"alpha", "945"}, // greek small letter alpha, U+03B1 ISOgrk3 -->
    {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
    {"gamma", "947"}, // greek small letter gamma, U+03B3 ISOgrk3 -->
    {"delta", "948"}, // greek small letter delta, U+03B4 ISOgrk3 -->
    {"epsilon", "949"}, // greek small letter epsilon, U+03B5 ISOgrk3 -->
    {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
    {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
    {"theta", "952"}, // greek small letter theta, U+03B8 ISOgrk3 -->
    {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
    {"kappa", "954"}, // greek small letter kappa, U+03BA ISOgrk3 -->
    {"lambda", "955"}, // greek small letter lambda, U+03BB ISOgrk3 -->
    {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
    {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
    {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
    {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
    {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
    {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
    {"sigmaf", "962"}, // greek small letter final sigma, U+03C2 ISOgrk3 -->
    {"sigma", "963"}, // greek small letter sigma, U+03C3 ISOgrk3 -->
    {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
    {"upsilon", "965"}, // greek small letter upsilon, U+03C5 ISOgrk3 -->
    {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
    {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
    {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
    {"omega", "969"}, // greek small letter omega, U+03C9 ISOgrk3 -->
    {"thetasym", "977"}, // greek small letter theta symbol, U+03D1 NEW -->
    {"upsih", "978"}, // greek upsilon with hook symbol, U+03D2 NEW -->
    {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
    // <!-- General Punctuation -->
    {"bull", "8226"}, // bullet = black small circle, U+2022 ISOpub -->
    // <!-- bullet is NOT the same as bullet operator, U+2219 -->
    {"hellip", "8230"}, // horizontal ellipsis = three dot leader, U+2026 ISOpub -->
    {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
    {"Prime", "8243"}, // double prime = seconds = inches, U+2033 ISOtech -->
    {"oline", "8254"}, // overline = spacing overscore, U+203E NEW -->
    {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
    // <!-- Letterlike Symbols -->
    {"weierp", "8472"}, // script capital P = power set = Weierstrass p, U+2118 ISOamso -->
    {"image", "8465"}, // blackletter capital I = imaginary part, U+2111 ISOamso -->
    {"real", "8476"}, // blackletter capital R = real part symbol, U+211C ISOamso -->
    {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
    {"alefsym", "8501"}, // alef symbol = first transfinite cardinal, U+2135 NEW -->
    // <!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although the
    // same glyph could be used to depict both characters -->
    // <!-- Arrows -->
    {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
    {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum -->
    {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
    {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
    {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
    {"crarr", "8629"}, // downwards arrow with corner leftwards = carriage return, U+21B5 NEW -->
    {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
    // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
    // arrow but also does not have any other character for that function.
    // So ? lArr can be used for 'is implied by' as ISOtech suggests -->
    {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
    {"rArr", "8658"}, // rightwards double arrow, U+21D2 ISOtech -->
    // <!-- ISO 10646 does not say this is the 'implies' character but does not
    // have another character with this function so ? rArr can be used for
    // 'implies' as ISOtech suggests -->
    {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
    {"hArr", "8660"}, // left right double arrow, U+21D4 ISOamsa -->
    // <!-- Mathematical Operators -->
    {"forall", "8704"}, // for all, U+2200 ISOtech -->
    {"part", "8706"}, // partial differential, U+2202 ISOtech -->
    {"exist", "8707"}, // there exists, U+2203 ISOtech -->
    {"empty", "8709"}, // empty set = null set = diameter, U+2205 ISOamso -->
    {"nabla", "8711"}, // nabla = backward difference, U+2207 ISOtech -->
    {"isin", "8712"}, // element of, U+2208 ISOtech -->
    {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
    {"ni", "8715"}, // contains as member, U+220B ISOtech -->
    // <!-- should there be a more memorable name than 'ni'? -->
    {"prod", "8719"}, // n-ary product = product sign, U+220F ISOamsb -->
    // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
    // though the same glyph might be used for both -->
    {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
    // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
    // though the same glyph might be used for both -->
    {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
    {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
    {"radic", "8730"}, // square root = radical sign, U+221A ISOtech -->
    {"prop", "8733"}, // proportional to, U+221D ISOtech -->
    {"infin", "8734"}, // infinity, U+221E ISOtech -->
    {"ang", "8736"}, // angle, U+2220 ISOamso -->
    {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
    {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
    {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
    {"cup", "8746"}, // union = cup, U+222A ISOtech -->
    {"int", "8747"}, // integral, U+222B ISOtech -->
    {"there4", "8756"}, // therefore, U+2234 ISOtech -->
    {"sim", "8764"}, // tilde operator = varies with = similar to, U+223C ISOtech -->
    // <!-- tilde operator is NOT the same character as the tilde, U+007E, although
    // the same glyph might be used to represent both -->
    {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
    {"asymp", "8776"}, // almost equal to = asymptotic to, U+2248 ISOamsr -->
    {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
    {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
    {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
    {"ge", "8805"}, // greater-than or equal to, U+2265 ISOtech -->
    {"sub", "8834"}, // subset of, U+2282 ISOtech -->
    {"sup", "8835"}, // superset of, U+2283 ISOtech -->
    // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
    // Symbol font encoding and is not included. Should it be, for symmetry?
    // It is in ISOamsn -->
    // The nsub row had been swallowed into the comment above, which left
    // &nsub; unresolvable; it is a real row again.
    {"nsub", "8836"}, // not a subset of, U+2284 ISOamsn -->
    {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
    {"supe", "8839"}, // superset of or equal to, U+2287 ISOtech -->
    {"oplus", "8853"}, // circled plus = direct sum, U+2295 ISOamsb -->
    {"otimes", "8855"}, // circled times = vector product, U+2297 ISOamsb -->
    {"perp", "8869"}, // up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
    {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
    // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
    // <!-- Miscellaneous Technical -->
    {"lceil", "8968"}, // left ceiling = apl upstile, U+2308 ISOamsc -->
    {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
    {"lfloor", "8970"}, // left floor = apl downstile, U+230A ISOamsc -->
    {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
    {"lang", "9001"}, // left-pointing angle bracket = bra, U+2329 ISOtech -->
    // <!-- lang is NOT the same character as U+003C 'less than' or U+2039
    // 'single left-pointing angle quotation mark' -->
    {"rang", "9002"}, // right-pointing angle bracket = ket, U+232A ISOtech -->
    // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
    // 'single right-pointing angle quotation mark' -->
    // <!-- Geometric Shapes -->
    {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
    // <!-- Miscellaneous Symbols -->
    {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
    // <!-- black here seems to mean filled as opposed to hollow -->
    {"clubs", "9827"}, // black club suit = shamrock, U+2663 ISOpub -->
    {"hearts", "9829"}, // black heart suit = valentine, U+2665 ISOpub -->
    {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
    // <!-- Latin Extended-A -->
    {"OElig", "338"}, // latin capital ligature OE, U+0152 ISOlat2 -->
    {"oelig", "339"}, // latin small ligature oe, U+0153 ISOlat2 -->
    // <!-- ligature is a misnomer, this is a separate character in some languages -->
    {"Scaron", "352"}, // latin capital letter S with caron, U+0160 ISOlat2 -->
    {"scaron", "353"}, // latin small letter s with caron, U+0161 ISOlat2 -->
    {"Yuml", "376"}, // latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
    // <!-- Spacing Modifier Letters -->
    {"circ", "710"}, // modifier letter circumflex accent, U+02C6 ISOpub -->
    {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
    // <!-- General Punctuation -->
    {"ensp", "8194"}, // en space, U+2002 ISOpub -->
    {"emsp", "8195"}, // em space, U+2003 ISOpub -->
    {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
    {"zwnj", "8204"}, // zero width non-joiner, U+200C NEW RFC 2070 -->
    {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
    {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
    {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
    {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
    {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
    {"lsquo", "8216"}, // left single quotation mark, U+2018 ISOnum -->
    {"rsquo", "8217"}, // right single quotation mark, U+2019 ISOnum -->
    {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
    {"ldquo", "8220"}, // left double quotation mark, U+201C ISOnum -->
    {"rdquo", "8221"}, // right double quotation mark, U+201D ISOnum -->
    {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
    {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
    {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
    {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
    {"lsaquo", "8249"}, // single left-pointing angle quotation mark, U+2039 ISO proposed -->
    // <!-- lsaquo is proposed but not yet ISO standardized -->
    {"rsaquo", "8250"}, // single right-pointing angle quotation mark, U+203A ISO proposed -->
    // <!-- rsaquo is proposed but not yet ISO standardized -->
    {"euro", "8364"}, // euro sign, U+20AC NEW -->
};
再扩展下
从前面可以看到转义字符中间的那段数字是unicode,那么这个转义字符可以随便构造了,并不限于上面的定义,比如“中”的unicode是20013,那么构造一个转义字符 &#20013;,经过浏览器的渲染就变回“中”了。虽然不必为了显示一个字符这么绕,但在一些不方便传输特殊字符的场景就可以派上用场了
本文转载自中文网 |