html的转义字符怎样通过代码识别

  偶尔会在数据中看到诸如 &#39; 这样的字符,特征如下:
 
  以&#开头,中间是一串数字,以;结尾
 
  以&开头,中间一串字符,以;结尾
 
  比如最常见的 &nbsp; 或者等价的 &#160;
 
  浏览器遇到这些转义符,会转义回来,但如何通过代码识别?org.apache.commons.lang.StringEscapeUtils.unescapeHtml提供了很好的说明
 
  遇到上面的第一种情况,中间是数字的,直接将数字(unicode)转为char
 
  遇到第二种情况,中间是字符,只能查映射表了:从映射表中找到字符对应的数字,再转换为char。看看代码就一目了然了
 
  看看HTML40如何定义的
 
  // Static initializer: creates the HTML40 entity table and populates it
  // through fillWithHtml40Entities when the enclosing class is initialized.
  static {
    HTML40 = new Entities();
    fillWithHtml40Entities(HTML40);
  }
 
  staticvoidfillWithHtml40Entities(Entitiesentities){
 
  entities.addEntities(BASIC_ARRAY);
 
  entities.addEntities(ISO8859_1_ARRAY);
 
  entities.addEntities(HTML40_ARRAY);
 
  }
 
  再看看BASIC_ARRAY、ISO8859_1_ARRAY、HTML40_ARRAY分别是什么
 
  BASIC_ARRAY
 
  privatestaticfinalString[][]BASIC_ARRAY={{"quot","34"},//"-double-quote
 
  {"amp","38"},//&-ampersand
 
  {"lt","60"},//<-less-than
 
  {"gt","62"},//>-greater-than
 
  };
 
  ISO8859_1_ARRAY
 
  
 
  
 
  
 
  
 
  
 
  
 
  staticfinalString[][]ISO8859_1_ARRAY={{"nbsp","160"},//non-breakingspace
 
  {"iexcl","161"},//invertedexclamationmark
 
  {"cent","162"},//centsign
 
  {"pound","163"},//poundsign
 
  {"curren","164"},//currencysign
 
  {"yen","165"},//yensign=yuansign
 
  {"brvbar","166"},//brokenbar=brokenverticalbar
 
  {"sect","167"},//sectionsign
 
  {"uml","168"},//diaeresis=spacingdiaeresis
 
  {"copy","169"},//�-copyrightsign
 
  {"ordf","170"},//feminineordinalindicator
 
  {"laquo","171"},//left-pointingdoubleanglequotationmark=leftpointingguillemet
 
  {"not","172"},//notsign
 
  {"shy","173"},//softhyphen=discretionaryhyphen
 
  {"reg","174"},//�-registeredtrademarksign
 
  {"macr","175"},//macron=spacingmacron=overline=APLoverbar
 
  {"deg","176"},//degreesign
 
  {"plusmn","177"},//plus-minussign=plus-or-minussign
 
  {"sup2","178"},//superscripttwo=superscriptdigittwo=squared
 
  {"sup3","179"},//superscriptthree=superscriptdigitthree=cubed
 
  {"acute","180"},//acuteaccent=spacingacute
 
  {"micro","181"},//microsign
 
  {"para","182"},//pilcrowsign=paragraphsign
 
  {"middot","183"},//middledot=Georgiancomma=Greekmiddledot
 
  {"cedil","184"},//cedilla=spacingcedilla
 
  {"sup1","185"},//superscriptone=superscriptdigitone
 
  {"ordm","186"},//masculineordinalindicator
 
  {"raquo","187"},//right-pointingdoubleanglequotationmark=rightpointingguillemet
 
  {"frac14","188"},//vulgarfractiononequarter=fractiononequarter
 
  {"frac12","189"},//vulgarfractiononehalf=fractiononehalf
 
  {"frac34","190"},//vulgarfractionthreequarters=fractionthreequarters
 
  {"iquest","191"},//invertedquestionmark=turnedquestionmark
 
  {"Agrave","192"},//�-uppercaseA,graveaccent
 
  {"Aacute","193"},//�-uppercaseA,acuteaccent
 
  {"Acirc","194"},//�-uppercaseA,circumflexaccent
 
  {"Atilde","195"},//�-uppercaseA,tilde
 
  {"Auml","196"},//�-uppercaseA,umlaut
 
  {"Aring","197"},//�-uppercaseA,ring
 
  {"AElig","198"},//�-uppercaseAE
 
  {"Ccedil","199"},//�-uppercaseC,cedilla
 
  {"Egrave","200"},//�-uppercaseE,graveaccent
 
  {"Eacute","201"},//�-uppercaseE,acuteaccent
 
  {"Ecirc","202"},//�-uppercaseE,circumflexaccent
 
  {"Euml","203"},//�-uppercaseE,umlaut
 
  {"Igrave","204"},//�-uppercaseI,graveaccent
 
  {"Iacute","205"},//�-uppercaseI,acuteaccent
 
  {"Icirc","206"},//�-uppercaseI,circumflexaccent
 
  {"Iuml","207"},//�-uppercaseI,umlaut
 
  {"ETH","208"},//�-uppercaseEth,Icelandic
 
  {"Ntilde","209"},//�-uppercaseN,tilde
 
  {"Ograve","210"},//�-uppercaseO,graveaccent
 
  {"Oacute","211"},//�-uppercaseO,acuteaccent
 
  {"Ocirc","212"},//�-uppercaseO,circumflexaccent
 
  {"Otilde","213"},//�-uppercaseO,tilde
 
  {"Ouml","214"},//�-uppercaseO,umlaut
 
  {"times","215"},//multiplicationsign
 
  {"Oslash","216"},//�-uppercaseO,slash
 
  {"Ugrave","217"},//�-uppercaseU,graveaccent
 
  {"Uacute","218"},//�-uppercaseU,acuteaccent
 
  {"Ucirc","219"},//�-uppercaseU,circumflexaccent
 
  {"Uuml","220"},//�-uppercaseU,umlaut
 
  {"Yacute","221"},//�-uppercaseY,acuteaccent
 
  {"THORN","222"},//�-uppercaseTHORN,Icelandic
 
  {"szlig","223"},//�-lowercasesharps,German
 
  {"agrave","224"},//�-lowercasea,graveaccent
 
  {"aacute","225"},//�-lowercasea,acuteaccent
 
  {"acirc","226"},//�-lowercasea,circumflexaccent
 
  {"atilde","227"},//�-lowercasea,tilde
 
  {"auml","228"},//�-lowercasea,umlaut
 
  {"aring","229"},//�-lowercasea,ring
 
  {"aelig","230"},//�-lowercaseae
 
  {"ccedil","231"},//�-lowercasec,cedilla
 
  {"egrave","232"},//�-lowercasee,graveaccent
 
  {"eacute","233"},//�-lowercasee,acuteaccent
 
  {"ecirc","234"},//�-lowercasee,circumflexaccent
 
  {"euml","235"},//�-lowercasee,umlaut
 
  {"igrave","236"},//�-lowercasei,graveaccent
 
  {"iacute","237"},//�-lowercasei,acuteaccent
 
  {"icirc","238"},//�-lowercasei,circumflexaccent
 
  {"iuml","239"},//�-lowercasei,umlaut
 
  {"eth","240"},//�-lowercaseeth,Icelandic
 
  {"ntilde","241"},//�-lowercasen,tilde
 
  {"ograve","242"},//�-lowercaseo,graveaccent
 
  {"oacute","243"},//�-lowercaseo,acuteaccent
 
  {"ocirc","244"},//�-lowercaseo,circumflexaccent
 
  {"otilde","245"},//�-lowercaseo,tilde
 
  {"ouml","246"},//�-lowercaseo,umlaut
 
  {"divide","247"},//divisionsign
 
  {"oslash","248"},//�-lowercaseo,slash
 
  {"ugrave","249"},//�-lowercaseu,graveaccent
 
  {"uacute","250"},//�-lowercaseu,acuteaccent
 
  {"ucirc","251"},//�-lowercaseu,circumflexaccent
 
  {"uuml","252"},//�-lowercaseu,umlaut
 
  {"yacute","253"},//�-lowercasey,acuteaccent
 
  {"thorn","254"},//�-lowercasethorn,Icelandic
 
  {"yuml","255"},//�-lowercasey,umlaut
 
  };
 
  HTML40_ARRAY
 
  
 
  
 
  
 
  
 
  
 
  
 
  
 
  
 
  
 
  
 
  
 
  
 
  staticfinalString[][]HTML40_ARRAY={
 
  //<!--LatinExtended-B-->
 
  {"fnof","402"},//latinsmallfwithhook=function=florin,U+0192ISOtech-->
 
  //<!--Greek-->
 
  {"Alpha","913"},//greekcapitalletteralpha,U+0391-->
 
  {"Beta","914"},//greekcapitalletterbeta,U+0392-->
 
  {"Gamma","915"},//greekcapitallettergamma,U+0393ISOgrk3-->
 
  {"Delta","916"},//greekcapitalletterdelta,U+0394ISOgrk3-->
 
  {"Epsilon","917"},//greekcapitalletterepsilon,U+0395-->
 
  {"Zeta","918"},//greekcapitalletterzeta,U+0396-->
 
  {"Eta","919"},//greekcapitallettereta,U+0397-->
 
  {"Theta","920"},//greekcapitallettertheta,U+0398ISOgrk3-->
 
  {"Iota","921"},//greekcapitalletteriota,U+0399-->
 
  {"Kappa","922"},//greekcapitalletterkappa,U+039A-->
 
  {"Lambda","923"},//greekcapitalletterlambda,U+039BISOgrk3-->
 
  {"Mu","924"},//greekcapitallettermu,U+039C-->
 
  {"Nu","925"},//greekcapitalletternu,U+039D-->
 
  {"Xi","926"},//greekcapitalletterxi,U+039EISOgrk3-->
 
  {"Omicron","927"},//greekcapitalletteromicron,U+039F-->
 
  {"Pi","928"},//greekcapitalletterpi,U+03A0ISOgrk3-->
 
  {"Rho","929"},//greekcapitalletterrho,U+03A1-->
 
  //<!--thereisnoSigmaf,andnoU+03A2charactereither-->
 
  {"Sigma","931"},//greekcapitallettersigma,U+03A3ISOgrk3-->
 
  {"Tau","932"},//greekcapitallettertau,U+03A4-->
 
  {"Upsilon","933"},//greekcapitalletterupsilon,U+03A5ISOgrk3-->
 
  {"Phi","934"},//greekcapitalletterphi,U+03A6ISOgrk3-->
 
  {"Chi","935"},//greekcapitalletterchi,U+03A7-->
 
  {"Psi","936"},//greekcapitalletterpsi,U+03A8ISOgrk3-->
 
  {"Omega","937"},//greekcapitalletteromega,U+03A9ISOgrk3-->
 
  {"alpha","945"},//greeksmallletteralpha,U+03B1ISOgrk3-->
 
  {"beta","946"},//greeksmallletterbeta,U+03B2ISOgrk3-->
 
  {"gamma","947"},//greeksmalllettergamma,U+03B3ISOgrk3-->
 
  {"delta","948"},//greeksmallletterdelta,U+03B4ISOgrk3-->
 
  {"epsilon","949"},//greeksmallletterepsilon,U+03B5ISOgrk3-->
 
  {"zeta","950"},//greeksmallletterzeta,U+03B6ISOgrk3-->
 
  {"eta","951"},//greeksmalllettereta,U+03B7ISOgrk3-->
 
  {"theta","952"},//greeksmalllettertheta,U+03B8ISOgrk3-->
 
  {"iota","953"},//greeksmallletteriota,U+03B9ISOgrk3-->
 
  {"kappa","954"},//greeksmallletterkappa,U+03BAISOgrk3-->
 
  {"lambda","955"},//greeksmallletterlambda,U+03BBISOgrk3-->
 
  {"mu","956"},//greeksmalllettermu,U+03BCISOgrk3-->
 
  {"nu","957"},//greeksmallletternu,U+03BDISOgrk3-->
 
  {"xi","958"},//greeksmallletterxi,U+03BEISOgrk3-->
 
  {"omicron","959"},//greeksmallletteromicron,U+03BFNEW-->
 
  {"pi","960"},//greeksmallletterpi,U+03C0ISOgrk3-->
 
  {"rho","961"},//greeksmallletterrho,U+03C1ISOgrk3-->
 
  {"sigmaf","962"},//greeksmallletterfinalsigma,U+03C2ISOgrk3-->
 
  {"sigma","963"},//greeksmalllettersigma,U+03C3ISOgrk3-->
 
  {"tau","964"},//greeksmalllettertau,U+03C4ISOgrk3-->
 
  {"upsilon","965"},//greeksmallletterupsilon,U+03C5ISOgrk3-->
 
  {"phi","966"},//greeksmallletterphi,U+03C6ISOgrk3-->
 
  {"chi","967"},//greeksmallletterchi,U+03C7ISOgrk3-->
 
  {"psi","968"},//greeksmallletterpsi,U+03C8ISOgrk3-->
 
  {"omega","969"},//greeksmallletteromega,U+03C9ISOgrk3-->
 
  {"thetasym","977"},//greeksmallletterthetasymbol,U+03D1NEW-->
 
  {"upsih","978"},//greekupsilonwithhooksymbol,U+03D2NEW-->
 
  {"piv","982"},//greekpisymbol,U+03D6ISOgrk3-->
 
  //<!--GeneralPunctuation-->
 
  {"bull","8226"},//bullet=blacksmallcircle,U+2022ISOpub-->
 
  //<!--bulletisNOTthesameasbulletoperator,U+2219-->
 
  {"hellip","8230"},//horizontalellipsis=threedotleader,U+2026ISOpub-->
 
  {"prime","8242"},//prime=minutes=feet,U+2032ISOtech-->
 
  {"Prime","8243"},//doubleprime=seconds=inches,U+2033ISOtech-->
 
  {"oline","8254"},//overline=spacingoverscore,U+203ENEW-->
 
  {"frasl","8260"},//fractionslash,U+2044NEW-->
 
  //<!--LetterlikeSymbols-->
 
  {"weierp","8472"},//scriptcapitalP=powerset=Weierstrassp,U+2118ISOamso-->
 
  {"image","8465"},//blacklettercapitalI=imaginarypart,U+2111ISOamso-->
 
  {"real","8476"},//blacklettercapitalR=realpartsymbol,U+211CISOamso-->
 
  {"trade","8482"},//trademarksign,U+2122ISOnum-->
 
  {"alefsym","8501"},//alefsymbol=firsttransfinitecardinal,U+2135NEW-->
 
  //<!--alefsymbolisNOTthesameashebrewletteralef,U+05D0althoughthe
 
  //sameglyphcouldbeusedtodepictbothcharacters-->
 
  //<!--Arrows-->
 
  {"larr","8592"},//leftwardsarrow,U+2190ISOnum-->
 
  {"uarr","8593"},//upwardsarrow,U+2191ISOnum-->
 
  {"rarr","8594"},//rightwardsarrow,U+2192ISOnum-->
 
  {"darr","8595"},//downwardsarrow,U+2193ISOnum-->
 
  {"harr","8596"},//leftrightarrow,U+2194ISOamsa-->
 
  {"crarr","8629"},//downwardsarrowwithcornerleftwards=carriagereturn,U+21B5NEW-->
 
  {"lArr","8656"},//leftwardsdoublearrow,U+21D0ISOtech-->
 
  //<!--ISO10646doesnotsaythatlArristhesameasthe'isimpliedby'
 
  //arrowbutalsodoesnothaveanyothercharacterforthatfunction.
 
  //So?lArrcanbeusedfor'isimpliedby'asISOtechsuggests-->
 
  {"uArr","8657"},//upwardsdoublearrow,U+21D1ISOamsa-->
 
  {"rArr","8658"},//rightwardsdoublearrow,U+21D2ISOtech-->
 
  //<!--ISO10646doesnotsaythisisthe'implies'characterbutdoesnot
 
  //haveanothercharacterwiththisfunctionso?rArrcanbeusedfor
 
  //'implies'asISOtechsuggests-->
 
  {"dArr","8659"},//downwardsdoublearrow,U+21D3ISOamsa-->
 
  {"hArr","8660"},//leftrightdoublearrow,U+21D4ISOamsa-->
 
  //<!--MathematicalOperators-->
 
  {"forall","8704"},//forall,U+2200ISOtech-->
 
  {"part","8706"},//partialdifferential,U+2202ISOtech-->
 
  {"exist","8707"},//thereexists,U+2203ISOtech-->
 
  {"empty","8709"},//emptyset=nullset=diameter,U+2205ISOamso-->
 
  {"nabla","8711"},//nabla=backwarddifference,U+2207ISOtech-->
 
  {"isin","8712"},//elementof,U+2208ISOtech-->
 
  {"notin","8713"},//notanelementof,U+2209ISOtech-->
 
  {"ni","8715"},//containsasmember,U+220BISOtech-->
 
  //<!--shouldtherebeamorememorablenamethan'ni'?-->
 
  {"prod","8719"},//n-aryproduct=productsign,U+220FISOamsb-->
 
  //<!--prodisNOTthesamecharacterasU+03A0'greekcapitalletterpi'
 
  //thoughthesameglyphmightbeusedforboth-->
 
  {"sum","8721"},//n-arysummation,U+2211ISOamsb-->
 
  //<!--sumisNOTthesamecharacterasU+03A3'greekcapitallettersigma'
 
  //thoughthesameglyphmightbeusedforboth-->
 
  {"minus","8722"},//minussign,U+2212ISOtech-->
 
  {"lowast","8727"},//asteriskoperator,U+2217ISOtech-->
 
  {"radic","8730"},//squareroot=radicalsign,U+221AISOtech-->
 
  {"prop","8733"},//proportionalto,U+221DISOtech-->
 
  {"infin","8734"},//infinity,U+221EISOtech-->
 
  {"ang","8736"},//angle,U+2220ISOamso-->
 
  {"and","8743"},//logicaland=wedge,U+2227ISOtech-->
 
  {"or","8744"},//logicalor=vee,U+2228ISOtech-->
 
  {"cap","8745"},//intersection=cap,U+2229ISOtech-->
 
  {"cup","8746"},//union=cup,U+222AISOtech-->
 
  {"int","8747"},//integral,U+222BISOtech-->
 
  {"there4","8756"},//therefore,U+2234ISOtech-->
 
  {"sim","8764"},//tildeoperator=varieswith=similarto,U+223CISOtech-->
 
  //<!--tildeoperatorisNOTthesamecharacterasthetilde,U+007E,although
 
  //thesameglyphmightbeusedtorepresentboth-->
 
  {"cong","8773"},//approximatelyequalto,U+2245ISOtech-->
 
  {"asymp","8776"},//almostequalto=asymptoticto,U+2248ISOamsr-->
 
  {"ne","8800"},//notequalto,U+2260ISOtech-->
 
  {"equiv","8801"},//identicalto,U+2261ISOtech-->
 
  {"le","8804"},//less-thanorequalto,U+2264ISOtech-->
 
  {"ge","8805"},//greater-thanorequalto,U+2265ISOtech-->
 
  {"sub","8834"},//subsetof,U+2282ISOtech-->
 
  {"sup","8835"},//supersetof,U+2283ISOtech-->
 
  //<!--notethatnsup,'notasupersetof,U+2283'isnotcoveredbythe
 
  //Symbolfontencodingandisnotincluded.Shoulditbe,forsymmetry?
 
  //ItisinISOamsn--><!ENTITYnsub","8836"},
 
  //notasubsetof,U+2284ISOamsn-->
 
  {"sube","8838"},//subsetoforequalto,U+2286ISOtech-->
 
  {"supe","8839"},//supersetoforequalto,U+2287ISOtech-->
 
  {"oplus","8853"},//circledplus=directsum,U+2295ISOamsb-->
 
  {"otimes","8855"},//circledtimes=vectorproduct,U+2297ISOamsb-->
 
  {"perp","8869"},//uptack=orthogonalto=perpendicular,U+22A5ISOtech-->
 
  {"sdot","8901"},//dotoperator,U+22C5ISOamsb-->
 
  //<!--dotoperatorisNOTthesamecharacterasU+00B7middledot-->
 
  //<!--MiscellaneousTechnical-->
 
  {"lceil","8968"},//leftceiling=aplupstile,U+2308ISOamsc-->
 
  {"rceil","8969"},//rightceiling,U+2309ISOamsc-->
 
  {"lfloor","8970"},//leftfloor=apldownstile,U+230AISOamsc-->
 
  {"rfloor","8971"},//rightfloor,U+230BISOamsc-->
 
  {"lang","9001"},//left-pointinganglebracket=bra,U+2329ISOtech-->
 
  //<!--langisNOTthesamecharacterasU+003C'lessthan'orU+2039'singleleft-pointinganglequotation
 
  //mark'-->
 
  {"rang","9002"},//right-pointinganglebracket=ket,U+232AISOtech-->
 
  //<!--rangisNOTthesamecharacterasU+003E'greaterthan'orU+203A
 
  //'singleright-pointinganglequotationmark'-->
 
  //<!--GeometricShapes-->
 
  {"loz","9674"},//lozenge,U+25CAISOpub-->
 
  //<!--MiscellaneousSymbols-->
 
  {"spades","9824"},//blackspadesuit,U+2660ISOpub-->
 
  //<!--blackhereseemstomeanfilledasopposedtohollow-->
 
  {"clubs","9827"},//blackclubsuit=shamrock,U+2663ISOpub-->
 
  {"hearts","9829"},//blackheartsuit=valentine,U+2665ISOpub-->
 
  {"diams","9830"},//blackdiamondsuit,U+2666ISOpub-->
 
  //<!--LatinExtended-A-->
 
  {"OElig","338"},//--latincapitalligatureOE,U+0152ISOlat2-->
 
  {"oelig","339"},//--latinsmallligatureoe,U+0153ISOlat2-->
 
  //<!--ligatureisamisnomer,thisisaseparatecharacterinsomelanguages-->
 
  {"Scaron","352"},//--latincapitalletterSwithcaron,U+0160ISOlat2-->
 
  {"scaron","353"},//--latinsmallletterswithcaron,U+0161ISOlat2-->
 
  {"Yuml","376"},//--latincapitalletterYwithdiaeresis,U+0178ISOlat2-->
 
  //<!--SpacingModifierLetters-->
 
  {"circ","710"},//--modifierlettercircumflexaccent,U+02C6ISOpub-->
 
  {"tilde","732"},//smalltilde,U+02DCISOdia-->
 
  //<!--GeneralPunctuation-->
 
  {"ensp","8194"},//enspace,U+2002ISOpub-->
 
  {"emsp","8195"},//emspace,U+2003ISOpub-->
 
  {"thinsp","8201"},//thinspace,U+2009ISOpub-->
 
  {"zwnj","8204"},//zerowidthnon-joiner,U+200CNEWRFC2070-->
 
  {"zwj","8205"},//zerowidthjoiner,U+200DNEWRFC2070-->
 
  {"lrm","8206"},//left-to-rightmark,U+200ENEWRFC2070-->
 
  {"rlm","8207"},//right-to-leftmark,U+200FNEWRFC2070-->
 
  {"ndash","8211"},//endash,U+2013ISOpub-->
 
  {"mdash","8212"},//emdash,U+2014ISOpub-->
 
  {"lsquo","8216"},//leftsinglequotationmark,U+2018ISOnum-->
 
  {"rsquo","8217"},//rightsinglequotationmark,U+2019ISOnum-->
 
  {"sbquo","8218"},//singlelow-9quotationmark,U+201ANEW-->
 
  {"ldquo","8220"},//leftdoublequotationmark,U+201CISOnum-->
 
  {"rdquo","8221"},//rightdoublequotationmark,U+201DISOnum-->
 
  {"bdquo","8222"},//doublelow-9quotationmark,U+201ENEW-->
 
  {"dagger","8224"},//dagger,U+2020ISOpub-->
 
  {"Dagger","8225"},//doubledagger,U+2021ISOpub-->
 
  {"permil","8240"},//permillesign,U+2030ISOtech-->
 
  {"lsaquo","8249"},//singleleft-pointinganglequotationmark,U+2039ISOproposed-->
 
  //<!--lsaquoisproposedbutnotyetISOstandardized-->
 
  {"rsaquo","8250"},//singleright-pointinganglequotationmark,U+203AISOproposed-->
 
  //<!--rsaquoisproposedbutnotyetISOstandardized-->
 
  {"euro","8364"},//--eurosign,U+20ACNEW-->
 
  };
 
  再扩展下
 
  从前面可以看到,转义字符中间的那段数字就是Unicode码点,那么转义字符可以随意构造,并不限于上面的定义。比如“中”的Unicode码点是20013,那么构造一个转义字符&#20013;,经过浏览器渲染后就变回“中”了。虽然只为显示一个字符不必这么绕,但在一些不方便传输特殊字符的场景中就可以派上用场了






本文转载自中文网
 

推荐阅读