| << 3 ˌ ̹皂 | 5 ˌ >> |
һ ˸ ˼ ը comp.lang.python ȫ “ȡ һ һ˂ HTMLը ݵ [headers|images|links] һЂ ˅?” “ȡ һ 롒 һ˂ HTML ը˂ [parse|translate|munge] ˅?” “ȡ һ һ˂ HTML տ [add|remove|quote] ˅?” ˌ˼ ݵ ը .
˟, һ ˅. տ ˼, BaseHTMLProcessor.py뵴, ˼ f 븇 HTML 쵵 煫 ՠ˞ . տ ˼, dialect.py뵴, ˼ 롒 HTML ը˂ տ챉 ˤ BaseHTMLProcessor.py ȍ վ 煫 . doc string 煹 ܡ Dz 翂 ˅ ҡ . ˼ , Ҁ ݵ ̹ ȡ ̉ 騂챉 ը. ۂ , ݵ ˼ һ .
Example 4.1. BaseHTMLProcessor.py
, ˼ ꡜ ̴ ȍ ϵ ˅ (Windows, UNIX, Mac OS).
from sgmllib import SGMLParser


class BaseHTMLProcessor(SGMLParser):
    """Round-trip HTML processor built on SGMLParser.

    Each handler rebuilds the markup (or keeps the text) it was called
    for and appends it to self.pieces; output() joins the pieces into a
    single string.  Descendants override individual handlers to change
    the document on its way through.
    """

    def reset(self):
        # extend (called by SGMLParser.__init__)
        self.pieces = []
        SGMLParser.reset(self)

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
        # Ideally we would like to reconstruct original tag and attributes, but
        # we may end up quoting attribute values that weren't quoted in the source
        # document, or we may change the type of quotes around the attribute value
        # (single to double quotes).
        # Note that improperly embedded non-HTML code (like client-side Javascript)
        # may be parsed incorrectly by the ancestor, causing runtime script errors.
        # All non-HTML code must be enclosed in HTML comment tags (<!-- code -->)
        # to ensure that it will pass through this parser unaltered (in handle_comment).
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be "pre"
        # Reconstruct the original end tag.
        self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for "&#160;", ref will be "160"
        # Reconstruct the original character reference.
        self.pieces.append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be "copy"
        # Reconstruct the original entity reference.
        self.pieces.append("&%(ref)s;" % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        # It is especially important that the source document enclose client-side
        # code (like Javascript) within comments so it can pass through this
        # processor undisturbed; see comments in unknown_starttag for details.
        self.pieces.append("<!--%(text)s-->" % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append("<!%(text)s>" % locals())

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)
import re
from BaseHTMLProcessor import BaseHTMLProcessor


class Dialectizer(BaseHTMLProcessor):
    """Rewrite the text of an HTML document with regex substitutions.

    Text inside <pre>...</pre> is passed through unaltered; all other
    text blocks are run through the (pattern, replacement) pairs in
    subs, which each descendant defines.
    """

    subs = ()

    def reset(self):
        # extend (called from __init__ in ancestor)
        # Reset all data attributes
        self.verbatim = 0
        BaseHTMLProcessor.reset(self)

    def start_pre(self, attrs):
        # called for every <pre> tag in HTML source
        # Increment verbatim mode count, then handle tag like normal
        self.verbatim += 1
        self.unknown_starttag("pre", attrs)

    def end_pre(self):
        # called for every </pre> tag in HTML source
        # Decrement verbatim mode count
        self.unknown_endtag("pre")
        self.verbatim -= 1

    def handle_data(self, text):
        # override
        # called for every block of text in HTML source
        # If in verbatim mode, save text unaltered;
        # otherwise process the text with a series of substitutions
        # (and-or idiom: an empty text block falls through to process())
        self.pieces.append(self.verbatim and text or self.process(text))

    def process(self, text):
        # called from handle_data
        # Process text block by performing series of regular expression
        # substitutions (actual substitutions are defined in descendant)
        for fromPattern, toPattern in self.subs:
            text = re.sub(fromPattern, toPattern, text)
        return text


class ChefDialectizer(Dialectizer):
    """convert HTML to Swedish Chef-speak

    based on the classic chef.x, copyright (c) 1992, 1993 John Hagerman
    """
    subs = ((r'a([nu])', r'u\1'),
            (r'A([nu])', r'U\1'),
            (r'a\B', r'e'),
            (r'A\B', r'E'),
            (r'en\b', r'ee'),
            (r'\Bew', r'oo'),
            (r'\Be\b', r'e-a'),
            (r'\be', r'i'),
            (r'\bE', r'I'),
            (r'\Bf', r'ff'),
            (r'\Bir', r'ur'),
            (r'(\w*?)i(\w*?)$', r'\1ee\2'),
            (r'\bow', r'oo'),
            (r'\bo', r'oo'),
            (r'\bO', r'Oo'),
            (r'the', r'zee'),
            (r'The', r'Zee'),
            (r'th\b', r't'),
            (r'\Btion', r'shun'),
            (r'\Bu', r'oo'),
            (r'\BU', r'Oo'),
            (r'v', r'f'),
            (r'V', r'F'),
            (r'w', r'w'),
            (r'W', r'W'),
            (r'([a-z])[.]', r'\1.  Bork Bork Bork!'))


class FuddDialectizer(Dialectizer):
    """convert HTML to Elmer Fudd-speak"""
    subs = ((r'[rl]', r'w'),
            (r'qu', r'qw'),
            (r'th\b', r'f'),
            (r'th', r'd'),
            (r'n[.]', r'n, uh-hah-hah-hah.'))


class OldeDialectizer(Dialectizer):
    """convert HTML to mock Middle English"""
    subs = ((r'i([bcdfghjklmnpqrstvwxyz])e\b', r'y\1'),
            (r'i([bcdfghjklmnpqrstvwxyz])e', r'y\1\1e'),
            (r'ick\b', r'yk'),
            (r'ia([bcdfghjklmnpqrstvwxyz])', r'e\1e'),
            (r'e[ea]([bcdfghjklmnpqrstvwxyz])', r'e\1e'),
            (r'([bcdfghjklmnpqrstvwxyz])y', r'\1ee'),
            (r'([bcdfghjklmnpqrstvwxyz])er', r'\1re'),
            (r'([aeiou])re\b', r'\1r'),
            (r'ia([bcdfghjklmnpqrstvwxyz])', r'i\1e'),
            (r'tion\b', r'cioun'),
            (r'ion\b', r'ioun'),
            (r'aid', r'ayde'),
            (r'ai', r'ey'),
            (r'ay\b', r'y'),
            (r'ay', r'ey'),
            (r'ant', r'aunt'),
            (r'ea', r'ee'),
            (r'oa', r'oo'),
            (r'ue', r'e'),
            (r'oe', r'o'),
            (r'ou', r'ow'),
            (r'ow', r'ou'),
            (r'\bhe', r'hi'),
            (r've\b', r'veth'),
            (r'se\b', r'e'),
            (r"'s\b", r'es'),
            (r'ic\b', r'ick'),
            (r'ics\b', r'icc'),
            (r'ical\b', r'ick'),
            (r'tle\b', r'til'),
            (r'll\b', r'l'),
            (r'ould\b', r'olde'),
            (r'own\b', r'oune'),
            (r'un\b', r'onne'),
            (r'rry\b', r'rye'),
            (r'est\b', r'este'),
            (r'pt\b', r'pte'),
            (r'th\b', r'the'),
            (r'ch\b', r'che'),
            (r'ss\b', r'sse'),
            (r'([wybdp])\b', r'\1e'),
            (r'([rnt])\b', r'\1\1e'),
            (r'from', r'fro'),
            (r'when', r'whan'))


def translate(url, dialect="chef"):
    """fetch URL and translate using dialect

    dialect in ("chef", "fudd", "olde")
    """
    import urllib
    sock = urllib.urlopen(url)
    try:
        htmlSource = sock.read()
    finally:
        # release the connection even when the read fails
        sock.close()
    # "chef" -> "ChefDialectizer", looked up among this module's globals;
    # an unknown dialect name raises KeyError here
    parserName = "%sDialectizer" % dialect.capitalize()
    parserClass = globals()[parserName]
    parser = parserClass()
    parser.feed(htmlSource)
    parser.close()
    return parser.output()


def test(url):
    """test all dialects against URL"""
    import webbrowser
    for dialect in ("chef", "fudd", "olde"):
        outfile = "%s.html" % dialect
        fsock = open(outfile, "wb")
        try:
            fsock.write(translate(url, dialect))
        finally:
            # close the output file even when the translation fails
            fsock.close()
        webbrowser.open_new(outfile)


if __name__ == "__main__":
    test("http://diveintopython.org/odbchelper_list.html")
Example 4.3. Output of dialect.py
Ү Lists 101 (The Muppets(낟) һ˫) Swedish Chef˂-ȁ, (ָ ՚ һ˫) Elmer Fudd-ȁ , (˂ 꼣 ̾ 悄) 秹 տ . ̉ ˂ HTML ȓ, ˼ ݵ HTML ˼ ͫ һ, һ ȍ˂ ݲ “տ” . ȓ, ȍ˼, ӡ ը տ ; 졜 һ˫ ϵ˼ ͫ ˼ ˅.
HTML ˂ һ翫: HTML 禡 һ, 禡 챉, 禡 HTML ˍ챉. տ , ˅, sgmllib.py ˂ .
sgmllib.py ˂ ւ﵂ ˅: SGMLParser. ը, (parser) ˉ е ˅ ˼ ̹ е 禡 禡ȫ . ˂ е ˅; ˂ , ჲ ȍ, .INI , ˟ , robots.txt , XML 챉 ˤ ݵ ˅.
е˼ ȗ 皂, ˼ 졒 ס ȼ ˞ ˜ˌ졒, ȍ һ е ȍ챉 . SGMLParser ˜ۂ ; , 禡 ܂ , Dz է Ҁ ˅ ̉. ȍ챉 ˤ, ˼ SGMLParser Ђ졒 ۲.
SGMLParser HTML 8 ˂ 졒, ס굎 ˤ ˂ ̉:
| 2.0 ˼ ՚ 皡 ˅ SGMLParser ˟ 븀 (handle_decl ̉ ), ס˼ DOCTYPE Ǹ̵ ˂킄. 2.1Ы ۵. | |
sgmllib.py ո̂煫 ݵ . ˼ sgmllib.py ˅, ჲ HTML ˂ ҄ , ס˼ ̉ . SGMLParser Ђ졒 unknown_starttag, unknown_endtag, handle_data ˂ 빚 ̉ ˂븇 ס˼ .
| ˩˂ IDE, ˼ “Run script” ȗ ჲ 빚 ۂ ˅. | |
Example 4.4. Sample test of sgmllib.py
̴˂ HTML ՚˟, toc.html ˅ ˼ 禡 ˅.
<h1> <a name='c40a'></a> Dive Into Python </h1> <p class='pubdate'> 28 Feb 2001 </p> <p class='copyright'> Copyright &copy; 2000, 2001 by <a href='mailto:f8dy@diveintopython.org' title='send e-mail to the author'> Mark Pilgrim </a> </p> <p> <a name='c40ab2b4'></a> <b></b> </p> <p> This book lives at <a href='http://diveintopython.org/'> http://diveintopython.org/ </a> . If you're reading it somewhere else, you may not have the latest version. </p>
sgmllib.py˂ ݵ Ȑ̉:
start tag: <h1>
start tag: <a name="c40a" >
end tag: </a>
data: 'Dive Into Python'
end tag: </h1>
start tag: <p class="pubdate" >
data: '28 Feb 2001'
end tag: </p>
start tag: <p class="copyright" >
data: 'Copyright '
*** unknown entity ref: &copy;
data: ' 2000, 2001 by '
start tag: <a href="mailto:f8dy@diveintopython.org" title="send e-mail to the author" >
data: 'Mark Pilgrim'
end tag: </a>
end tag: </p>
start tag: <p>
start tag: <a name="c40ab2b4" >
end tag: </a>
start tag: <b>
end tag: </b>
end tag: </p>
start tag: <p>
data: 'This book lives at '
start tag: <a href="http://diveintopython.org/" >
data: 'http://diveintopython.org/'
end tag: </a>
data: ".\012If you're reading it somewhere else, you may not have the lates"
data: 't version.\012'
end tag: </p>
ˌ˂ һ ˤ 皵 ˅:
HTML ը ̧̉챉 ˤЫ, SGMLParser Ђ즗 һւ졒 ̹ ˼ ˤ ˂즗.
HTML ը ̧̉챉ˤ տ ˂ HTML . ˂ 쵌 ݸ ˂ HTML қ ˅, ˼ ﹚ ȍ ס ˅, һ ˍ ȓ ˅ ˴ HTML .
Example 4.5. Introducing urllib
>>> import urllib
>>> sock = urllib.urlopen("http://diveintopython.org/")
>>> htmlSource = sock.read()
>>> sock.close()
>>> print htmlSource
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head> <meta http-equiv='Content-Type' content='text/html; charset=ISO-8859-1'> <title>Dive Into Python</title> <link rel='stylesheet' href='diveintopython.css' type='text/css'> <link rev='made' href='mailto:f8dy@diveintopython.org'> <meta name='keywords' content='Python, Dive Into Python, tutorial, object-oriented, programming, documentation, book, free'> <meta name='description' content='a free Python tutorial for experienced programmers'> </head> <body bgcolor='white' text='black' link='#0000FF' vlink='#840084' alink='#0000FF'> <table cellpadding='0' cellspacing='0' border='0' width='100%'> <tr><td class='header' width='1%' valign='top'>diveintopython.org</td> <td width='99%' align='right'><hr size='1' noshade></td></tr> <tr><td class='tagline' colspan='2'>Python for experienced programmers</td></tr> [...snip...]
Example 4.6. Introducing urllister.py
, ˼ ꡜ ̴ ȍ ϵ ˅(Windows, UNIX, Mac OS).
from sgmllib import SGMLParser


class URLLister(SGMLParser):
    """Collect the href value of every <a> tag fed to the parser.

    After feed()/close(), the collected values are in self.urls, in
    document order.
    """

    def reset(self):
        # extend (called by SGMLParser.__init__): start with an empty URL list
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # called for each <a> start tag
        # attrs is a list of (attribute, value) tuples; keep only href values
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
| reset˼ SGMLParser˂ __init__ ˂ ̉, ס˼ ˂ ̹ ̉ ˅. 汉С , __init__С 즗, reset 汉 즗, 븇 ס˼ ȍ ̹ ˍ-ȍ ˞ ˍ-汉С . | |
| start_a <a> է SGMLParser ˂ ̉. href ւ ˅, /˼ name title ˼, ˅. attrs Ҽ ˂ , [(attribute, value), (attribute, value), ...]. ˼ ס˼ <a> ˅, ( 皦) HTML , attrs˼ . | |
| <a> ס href 皡 ˅ - տ 皡 貮Ҳ ˅. | |
| k=='href' ˼ ը ҫ ȗ ը 롬, һ ˟, Ҁ SGMLParser attrs ̈ ը 챉 ը. |
Example 4.7. Using urllister.py
>>> import urllib, urllister
>>> usock = urllib.urlopen("http://diveintopython.org/")
>>> parser = urllister.URLLister()
>>> parser.feed(usock.read())
>>> usock.close()
>>> parser.close()
>>> for url in parser.urls: print url
toc.html #download toc.html history.html download/dip_pdf.zip download/dip_pdf.tgz download/dip_pdf.hqx download/diveintopython.pdf download/diveintopython.zip download/diveintopython.tgz download/diveintopython.hqx [...snip...]
| SGMLParser ˂, feed ̉, HTML ҅.[7] ס˼ ը ̑嵴, usock.read() ՠ . | |
| 쵎 坡, ˼ һ һ ̅ ȍ URL ̹ . | |
| ˼ ̹ , , һ ˅. feed ݵ HTML ˌ ; ס˼ ˂ HTML ՚ ˜ˌ졒, ˱ . ȗ , close ̉ ՚ ˜ˌ ݵ ˟ . | |
| С , ˼ һ, parser.urls ˼ HTML ը ˅ ݵ URL˂ 翫. |
SGMLParser ̹ ǡ굵 Ȑ̉ . ס˼ 졒 졒, , ס˼ է ˂ 굎 ˞͂ ̉, һ ǡ굵 . SGMLParser HTML (consumer): ס˼ HTML ̑졒 ס ˼, е 禡 ܂. ˟ , ˼ SGMLParser Ђ ۂ ˉȫ ˂ ˅ ˴ ݵ ˼ , 굎 Ȑȫ ˂ ˅. ˸ ˟̀ SGMLParser С է́ ݵ һւ졒 ˟ HTMLը ˍ ˂ . ˞ , HTML Ȑ(producer) .
BaseHTMLProcessor SGMLParser Ђ 8˂ ݵ ˞ ϡ: unknown_starttag, unknown_endtag, handle_charref, handle_entityref, handle_comment, handle_pi, handle_decl, and handle_data.
Example 4.8. Introducing BaseHTMLProcessor
class BaseHTMLProcessor(SGMLParser):
    """Rebuild the HTML it is fed, piece by piece, into self.pieces."""

    def reset(self):
        # start with an empty list of output pieces, then reset the ancestor
        self.pieces = []
        SGMLParser.reset(self)

    def unknown_starttag(self, tag, attrs):
        # attrs is a list of (attr, value) pairs; every value is written
        # back inside double quotes
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    def unknown_endtag(self, tag):
        # rebuild the end tag
        self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # rebuild a character reference, e.g. ref "160" -> "&#160;"
        self.pieces.append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        # rebuild an entity reference, e.g. ref "copy" -> "&copy;"
        self.pieces.append("&%(ref)s;" % locals())

    def handle_data(self, text):
        # plain text is stored unchanged
        self.pieces.append(text)

    def handle_comment(self, text):
        # rebuild an HTML comment
        self.pieces.append("<!--%(text)s-->" % locals())

    def handle_pi(self, text):
        # rebuild a processing instruction
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        # rebuild a declaration such as the DOCTYPE
        self.pieces.append("<!%(text)s>" % locals())
| reset˼, SGMLParser.__init__ ˂ ̉, self.pieces ȗ ̉챉˟ 汉Ђ. self.pieces 졒 ͫ HTML ը˂ 禡 . SGMLParser HTML ˍ , ը self.pieces ̧ . self.pieces 즗. ˼ 稹 ס ը ˂졒 ס 禡 ̧ 皵 . ס굵 , һ ˼ ፫嵴 ˅ ˞.[8] | |
| BaseHTMLProcessor ۂ ˤ (URLLister ˅ start_a ˼) ˂ Ղ, SGMLParser ݵ (start tag) unknown_starttag ̉ . (tag) / ˂ (attrs) ̑, ˂ HTML ˍ, ס self.pieces ̧. ը ȗ; ˸ ס . | |
| (end tags) ˍ ˼ ; 偩 ̑ '</...>' ҅. | |
| SGMLParser ը է, handle_charref ը 皡 ̉. HTML ըС   ւ졒 ˅, ref 160 . ˂ ˟ ը ˍ ˼ ref &#...;ը ҅ . | |
| ̹ 禫 ը , һ (-) . ˂ ̹ ˍ ˼ ref &...; ը ҅ . | |
| ˼ self.pieces ˼ ̧. | |
| HTML 煹˼ <!--...--> ը ˅. | |
| ჲ <?...> ը ˅. |
| HTML ȍ˼ (¦݁-̿ Ү ˼) ݵ -HTML˼ HTML 煹 稲 , һ ݵ ˴ 皡 ˞챉 ˼ ( ˂ ݵ ˴ ˜˼ 櫛 낄). BaseHTMLProcessor С ; Ү ˞ , Ы ˞ Ү HTML . , Ү һ ȍ ւ, SGMLParser է ˧ ࡢ . SGMLParser ȗ ը , Ү 皵 , (˂ HTML ըС ˼ ȍ 皦) BaseHTMLProcessor ȗ , 騸 Ү . ȗ ˂ ¦ݮ- Ү HTML 煹 飂즗. | |
Example 4.9. BaseHTMLProcessor output
def output(self):
    """Return processed HTML as a single string"""
    # self.pieces is a list of strings; join them once at the end
    return "".join(self.pieces)
| ˼ ȗ SGMLParser ˂ ̉ , BaseHTMLProcessor ˅ һ˂ . ˂ ˍ HTML self.pieces ˜ˌՂ, ﹚ ݵ 禡 һ˂ ը ͂嵴 . ˟ , ˼ 졒 ը ˜ ՟, ȍ ˞ ס ˟ ը . | |
| , ˼ string ݵ˂ join ȍ ˅ : string.join(self.pieces, "") |
˼ ˂ ˌ ﹚ 翫, locals globals, ס˼ ˟ ȍ˟-悄 碱 ϡ.
˜, ˂ . ˼ 禂 , һ , Ղ 즗. ˼ 굎 ȍ ̧˞ 皂. ˼ ȍ˟ ˂ ȍ˟˂ ˼ ˂ . ȍ, ˼ ȍ˟ 碱 ˅, ˇ ס .
ۂ , ˂ ȍ. ﹚ ˂ 翫, 嵴, ﹚˂ ̧˞ 皂. ﹚ 빚 ˞ ˂ ւ. ݵ˼ ˂ 翫, ˟ 롒 嵴, ݵ˂ ̧˞ 皂. ﹚, , ݵ ݵ, ݵ- ȗ ւ. ˌ ˅嵴, ݵ ݵ 碱, ˌ ﹚ .
˂ x˂ , ˼ ݵ , Ы, ̣ :
е x ̣ , ˼ ֱ졒 'There is no variable named 'x'(x ˼ ˸)' 皡 NameError ӫ. ˼ ס ղ 1 ˌ , һ ˼ 煱 ˟ һ ˼ 졒 ˅ ȍ .
2.2 Ț ς ҂ : Ȯֵ (nested scopes). 2.0, һ˂ Ȯֵ ﹚ ֫ (lambda) ﹚ 禂 , ˼ ˍ˂ (Ȯֵ lambda) ﹚˂ , һ ݵ˂ Ț . 2.2 ˍ˂ (Ȯֵ lambda) ﹚˂ , Ы ݂﹚˂ , һ ݵ˂ Ț . 2.1˼ վ ݵ ˅; , ס˼ 2.0 , һ ˼ ˸˂ ˂ ݵ ȗ ̧ ˂ ݵ 2.2 쵵 ˅:
from __future__ import nested_scopes | |
Ы 굎 坡, ˼ - ˞ 碱. ˼ ˌ locals ﹚ 碱졒, ˟ (ݵ ) ˼ ˌ globals ﹚ 碱.
Example 4.10. Introducing locals
>>> def foo(arg):
...     x = 1
...     print locals()
...
>>> foo(7)
{'arg': 7, 'x': 1}
>>> foo('bar')
{'arg': 'bar', 'x': 1}
| ﹚ foo ˂ ˂ 皡 ˅: arg, ˼ ﹚ ײ翫, x, ˼ ﹚ ˂. | |
| locals / ȍ˟ ՠ. ȍ˟˂ ը ˂ ; ȍ˟˂ ˼ ˂ ϡ. foo 7 ̉ ﹚˂ ˂ ւ ȍ˟ ̉傄: arg (7) x (1). | |
| 즗, ˼ ˞ 翫, ˼ ը arg ˅; ﹚ ( locals ̉) ˟ ˧ . locals˼ ݵ ˂ ݵ . |
locals (﹚) , globals ˟ (ݵ) . , globals , Ҁ ݵ˂ ˅ ը.[9] ݵ˂ ˼ ݵ-˂ ȗ ˅ 즗, ݵ ˂ ݵ ﹚ ւ. , ס˼ ݵ ւ.
from module import import moduleȍ˂ ? import module ȍ, ݵ ̹ , һ ס˼ ˂ 皂, ס ݵ ȍ ݵ˂ ﹚ ˼ 碱 : module.function. һ from module import ȍ, ˼ ݵ ۂ ﹚ ˂ , ס ˂ ݵ 禂 桒 ˞ 碱 ( ˅) . globals ﹚, ˼ ȗ ˅.
Example 4.11. Introducing globals
Add the following block to BaseHTMLProcessor.py:
if __name__ == "__main__":
    # dump every name defined in this module's global namespace
    for k, v in globals().items():
        # single-argument print: same "name = value" line on Python 2 and 3
        print("%s = %s" % (k, v))
| ˂ , ˟ ݵ 즗. globals ﹚ ȍ˟ ՠ, items - ȍ ȍ˟ ՠՂ. ț 삄 ˼ globals ﹚. |
Ү ჲ ̉ 篫:
c:\docbook\dip\py>python BaseHTMLProcessor.py
SGMLParser = sgmllib.SGMLParser__doc__ = None
BaseHTMLProcessor = __main__.BaseHTMLProcessor
__name__ = __main__
__builtins__ = <module '__builtin__' (built-in)>
| SGMLParser from module import ȍ, sgmllib . ס ˞ ˂ ݵ ﵂ ס ˅. | |
| ݵ ݵ˼ doc string 皫嵴, ˌ __doc__ 碱. ݵ˼ ˞ ס ˂ Ղ, ס˼ None ȡ . | |
| ݵ˼ , BaseHTMLProcessor, ˂, ˅. ˅ ˼ ˂ ̹ 즗, ̹ 즗. | |
| if __name__ trick ? ݵ ( ݵ ס ꡜ 禂), ˌ __name__ ˼, , __main__. ݵ ჲ Ү Ղ, __name__˼ __main__, ס globals ̉챉 ˤ ˂ ˼ . |
| locals globals ﹚ ȍ, ˼ ˂˞ ˂ ˞ յ, ը ϡ ˅. ˼ getattr ﹚˂ ȫ嵴, ס˼ ﹚˂ ը ϡ븇 ˂˞ ﹚ ˞ 碱 ˅ 篫. | |
ȼ˞, ˼ (ȍ˟ ˼) վ ̧˞; locals globals ȼ˂ ˞ ̢ը. ס ݵ , ݵ , ݵ , ς . ˼ ݵ ˼ ̹ һ˂ ? , ț ˅: ݵ ˼ ȍ˟.
ը ը 챉 ˤ վ ϡ. ˼ һе ը ˂ ﵂翫. ˼ ˞ ՠ, ﵂ , ס˼ ȗ ܱ ˌ ˼ . ˼ ը տ f 킯 ; ˼ ը ܫ ꡜ ܫ ՠՂ.
ȍ ȍ˟ ȍ ˞ ˂ ը С ˅.
Example 4.12. Introducing dictionary-based string formatting
>>> params = {"server":"mpilgrim", "database":"master", "uid":"sa", "pwd":"secret"}
>>> "%(pwd)s" % params
'secret'
>>> "%(pwd)s is not a good password for %(uid)s" % params
'secret is not a good password for sa'
>>> "%(database)s of mind, %(database)s of body" % params
'master of mind, master of body'
| ˞ 皫 , ˂ ը Ы ȍ˟, params ȍ. ը %s , ˼ 翫. ˼ params ȍ˟˂ ȍ皡 ȗ˂ , secret, %(pwd)s . | |
| ȍ˟- ը Ы ҹ˂ ˅ 굵 . 煲 ȍ˟ ˍ , Ы KeyError 皡 ܂ . | |
| ˼ 皲 ˼ տ ۂ ˅; ̉ ˼ . |
˼ ȍ˟-悄 ը ȍ硒 ? ˸, ˸ 皫 ȍ˟ ۂ ը ˼ ; ˂˅ 皡 ˅ ס˼ . locals.
Example 4.13. Dictionary-based string formatting in BaseHTMLProcessor.py
def handle_comment(self, text):
    """Rebuild an HTML comment around text and append it to self.pieces."""
    # locals() supplies the 'text' key for the dictionary-based format string
    comment = "<!--%(text)s-->" % locals()
    self.pieces.append(comment)
| ˌ locals ﹚ ȍ ˼ ȍ˟-悄 ը ˂ ˌ 鍂 ȍ. ס ˼ ˂ ը ˂ ȍ ˅ ˂ ( , text뵴, ס˼ 빚 ҄ҫ) ˂ 皲 ˂ ˂킄. text 'Begin page footer', ը "<!--%(text)s-->" % locals() ը '<!--Begin page footer-->' |
def unknown_starttag(self, tag, attrs):
    """Rebuild a start tag from its name and attributes; append to self.pieces."""
    # attrs is a list of (attr, value) pairs; each becomes ' attr="value"'
    strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
    # locals() supplies 'tag' and 'strattrs' to the dictionary-based format
    self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
![]()
̉, attrs˼ / ˂ , ȍ˟˂ items ˼, ס˼ - ȍ ס ՠՂ ˅ ˂킄. ˼ 皱皫 ւ ܁ , һ Ы 翂 ˼ ˅, Ղ ס ܂ :
| |
| , ȍ˟-悄 ը ȍ, tag strattrs˂ ը . tag 'a', '<a href="index.html" title="Go to home page">' , ס˼ self.pieces ̧ . |
| locals 皡 ȍ˟-悄 ը ȍ ˼ վ ˉ ը ܱ , һ ס . locals ̉嵴 ˂ ̾ ˅. ՠ˞, ۂ ̾܂ , һ ( տ ւ) ը 皡 ˅, ˼ ȗ˞ -悄 . | |
comp.lang.python 鍂 ը ˅. “һ ˼ 皫 쫆˂ HTML ըе 皡 ˅嵴, һ ס굎 ݵ ˞ 졒 . ȡ һ ˅?”[10] ( ը˼ ՠ˞ 礮 ˂ 皫嵴 ˼ HTML˼-- Ҋ 皡 礮 ͂졒 ݵ 皵˼ HTML ՠ ˌ. е ˼ ˼ HTML 鍂 ˤՠˤ.) 妵, ˼ ˼ HTML BaseHTMLProcessor ҅ 븇, ˅.
BaseHTMLProcessor HTML (Ҁ ס˼ SGMLParser˂ ը) HTML Ֆ Ҹ, һ HTML ̉塇 塜 . ˼ ס굎 ը ˼ 鴂͵ 皦, ը , ˼, ס굎 ˼ 皦 , 皡 . ˅ ˼ .
Example 4.14. Quoting attribute values
>>> htmlSource = """... <html> ... <head> ... <title>Test page</title> ... </head> ... <body> ... <ul> ... <li><a href=index.html>Home</a></li> ... <li><a href=toc.html>Table of contents</a></li> ... <li><a href=history.html>Revision history</a></li> ... </body> ... </html> ... """ >>> from BaseHTMLProcessor import BaseHTMLProcessor >>> parser = BaseHTMLProcessor() >>> parser.feed(htmlSource)
>>> print parser.output()
<html> <head> <title>Test page</title> </head> <body> <ul> <li><a href="index.html">Home</a></li> <li><a href="toc.html">Table of contents</a></li> <li><a href="history.html">Revision history</a></li> </body> </html>
| ˼ <a> ˅ href ˂ ˞ . ( ˼ ը ը (doc string)ȡ ˤ ȕ ȍ졒 ˅ . IDE ˞, (ȍ). ס굎˼ .) | |
| (parser) . | |
| BaseHTMLProcessor ˂ output ﹚ ȍ, ̉塇, 칼, ˂ ը յ. , һ ˼ 쵎 첔һ ࡢ : SGMLParser ˟̹ HTML ը , ס tags, refs, data, ܂; BaseHTMLProcessor ȍ HTML 禡 ˍ ( ס굎 , ס˼ parser.pieces ˟ ˜ˌ); , parser.output ̉, ס˼ ݵ HTML 禡 ˂ ը ͂. |
Dialectizer BaseHTMLProcessor˂ ( ˜) . ס˼ ˂ ̹۹ ́ӫ, һ ס˼ 騸 <pre>...</pre> ˅ 굵 ́ӫ.
<pre> 챉 ˤ, ˂ Dialectizer ˂: start_pre end_pre.
Example 4.15. Handling specific tags
def start_pre(self, attrs):
    # called for every <pre> tag in HTML source
    # Increment verbatim mode count, then handle tag like normal
    self.verbatim += 1
    self.unknown_starttag("pre", attrs)


def end_pre(self):
    # called for every </pre> tag in HTML source
    # Emit the end tag first, then decrement verbatim mode count
    self.unknown_endtag("pre")
    self.verbatim -= 1
| start_pre SGMLParser <pre> HTML է տ ̉. (ˇ , 騂 ȡ 첔һ ȓ .) ˂ Ҽ, attrs ̑, ס˼ ( ˅) ˂ ˅. attrs, unknown_starttag ̑ / ˂ . | |
| reset , <pre> ˤ 汉Ђ. <pre> һ , 盡́ӫ; </pre> , ́ӫ. ( ȍ ˅ ס 1 ۂ졒 0 ˍۂ, һ վ˼ ꡜ 坡 ˅, վ˼ Ȯֵ <pre> צ 킄 ( ) .) ˇ , ȡ ȍ ˅ ȓ . | |
| ס, ס <pre> ˤ 삄 . ˼ unknown_starttag « 篫 ס˼ ۵ ˅. | |
| end_pre SGMLParser </pre> է ̉. ւ Ղ, Ҽ ̑ . | |
| տ, (end tag) 坡 , ۵ 챉 . | |
| տ, һ ̀ <pre> ˼ 翫. |
, SGMLParser 禱 ˼ ˅. һ 禂嵴 ( ˼ 皱 ס 첔 뵴) SGMLParser ˂ ˤ , ˍ, ̣ ̉ . , վ start_pre end_pre˂ ˂ <pre> </pre> . һ ȡ ? ˸, ס˼ , ס˼ ˂ .
def finish_starttag(self, tag, attrs):
    # Dispatch a start tag to the most specific handler that exists.
    # Returns 1 for a start_xxx handler, 0 for a do_xxx handler and -1
    # when neither exists and unknown_starttag was called instead.
    try:
        method = getattr(self, 'start_' + tag)
    except AttributeError:
        # no start_xxx method; fall back to do_xxx
        try:
            method = getattr(self, 'do_' + tag)
        except AttributeError:
            # no specific handler at all
            self.unknown_starttag(tag, attrs)
            return -1
        else:
            self.handle_starttag(tag, method, attrs)
            return 0
    else:
        # only tags handled by start_xxx are pushed onto the stack
        self.stack.append(tag)
        self.handle_starttag(tag, method, attrs)
        return 1


def handle_starttag(self, tag, method, attrs):
    # method is the bound handler found by finish_starttag; call it
    method(attrs)
| Ы, SGMLParser (start tag) է졒 . ˼ 삄 ˼ ˤ ۂ ˅ ˼ (unknown_starttag) ˂ 貮ȫ . | |
| SGMLParser˂ “”˼ ˂ , getattr . ˟ ˅ ˼ getattr ̹˂ ͵ ̹ ˂ ̣ . ̹ self, ˍ˂ ̹. tag 'pre', getattr̉˼ ˍ ̹ start_pre ̣ , ס˼ Dialectizer ˂ ̹. | |
| ̣ ˅ ̹ ˍ (˼ ˂ ), getattr˼ AttributeError ӫ, һ ס˼ ըϡ , Ҁ getattr ̉ try...except ҅ ˞ AttributeError һւ ը. | |
| start_xxx է Ղ, ֱ챉 ˟ do_xxx ̣ȫ. ̹ ˟˼ <br> ˼ ˞ ұ ˤ ՠ˞ ȍ, ס굎˼ ȗ˂ . һ ˼ ߱ ˟ ȍ ˅; ̂, SGMLParser ݵ ̵. (, ˼ start_xxx do_xxx ˼ ˂Ы ; start_xxx ̉ .) | |
| һ˂ AttributeError, ס˼ getattr ̉ do_xxx ܂ ˂킄. start_xxx do_xxx է Ղ, һւ , unknown_starttag ˂皂. | |
| ˼, try...except ˼ else ˅, ס˼ try...except 첔һ ̉. Ғ˞, ס˼ ˤ do_xxx է ˂, ס ̉ . | |
| start_xxx do_xxx ˞ ̉ ; , , ˼ ﹚, handle_starttag 翫, ͵˼ ס ۲ ˅ ݵ ס Ȳ皫 վ ˅. ۵ ˂ ϲ 皫 , ˂ 쵵 뱉 , (start_xxx do_xxx) ˂ ̉ . ˼, method ﹚, getattr ՠ, ﹚ ̹. ( һ , ˼ ˂ ˤ ס ț վ ̣ һ 皂 .) , ﹚ ̹ ȱ 빚 ײ翫, ղ ﹚ ̉. , ﹚ Dz, Dz, ˼ ˂ ; ﹚ 貮 삄 ˅ ס ˂ 빚, attrs ̉ . |
˂ յ ղ: Dialectizer. , <pre> </pre> ˤ ˂졒 ˅ . Ӳ ˅, ס˼ -˂ ա . ס ˤ, handle_data ۲ ˅.
Example 4.17. Overriding the handle_data method
def handle_data(self, text):
    """Store a text block: unaltered in verbatim mode, else processed."""
    # and-or idiom: yields text only when self.verbatim and text are both
    # true; in every other case the result is self.process(text)
    piece = self.verbatim and text or self.process(text)
    self.pieces.append(piece)
| handle_data һ˂ 빚, ̉. | |
| ȗ BaseHTMLProcessor, handle_data ̉ ՚, self.pieces ̧. Ғ ˉ ۵. <pre>...</pre> ˅, self.verbatim˼ 0 , ̉ ՚ ˼ ҅ . , ˂ ̉ 졒, ˂ ̉ ՚ ҅ . , ˼, and-or Ӏ ȍ, - . |
˂ Dialectizer 킯 . ̹˂ . 禱 , Ղ , 삄 ˞ ˼ ۱ ˼ .
۱ ˼ ˉ ܁˂ ը 졒, , Ț 傄 ( е) վ. (ޡ ˼) ݲ ۱ ȍ , ˼ re ݵ˂ ܡ ﹚ ˂ 빚 տ ȓ.
ը˼ Ț (index, find, count), (replace), (split) ˤ 翫, һ ס굎˼ ˌ ۵ ˅. Ț ˂, - ը ̣, ס굎˼ ȗ -ը 롬; ը s ը 롬 Ț , ˼ s.lower() s.upper() ̉ 졒 ˂ Ț ը ը 낯 . replace split ˼ ς 翫. ˼ ˅ ס굎 ȍ (ס굎˼ ܱ ), һ ˉ 굎 ˤЫ, ˼ ۱ ҙ .
Example 4.18. Matching at the end of a string
˂ ϵ˼, ț ̸ 챉 ˟ ˂ ̸ ̉ 煹 Ђ졒 ϡ, ȡ ҫ -Ȕ˂ ը ǵ . (, һ ˜ ˍ ; ס˼ .)
>>> s = '100 NORTH MAIN ROAD'
>>> s.replace('ROAD', 'RD.')
'100 NORTH MAIN RD.'
>>> s = '100 NORTH BROAD ROAD'
>>> s.replace('ROAD', 'RD.')
'100 NORTH BRD. RD.'
>>> s[:-4] + s[-4:].replace('ROAD', 'RD.')
'100 NORTH BROAD RD.'
>>> import re
>>> re.sub('ROAD$', 'RD.', s)
'100 NORTH BROAD RD.'
Example 4.19. Matching whole words
>>> s = '100 BROAD'
>>> re.sub('ROAD$', 'RD.', s)
'100 BRD.'
>>> re.sub('\\bROAD$', 'RD.', s)
'100 BROAD'
>>> re.sub(r'\bROAD$', 'RD.', s)
'100 BROAD'
>>> s = '100 BROAD ROAD APT. 3'
>>> re.sub(r'\bROAD$', 'RD.', s)
'100 BROAD ROAD APT. 3'
>>> re.sub(r'\bROAD\b', 'RD.', s)
'100 BROAD RD. APT. 3'
˼ ۱ ˅ Ȑ˂ 졢 . ס˼ һ , ̴ ˟̹ ס ፫嵴 ̀翫. ס˼ ݵ ը ˼ ˼ . ˼ ס굎 ̾ Տ ס굎 ˞ 貮 졒, ס굎 ը ը 貮 .
|
ȍֵ˼ ը 忂, “һ 貮, һ ۱ ” ࡢ. ˼ ˂ ը 翫. | ||
| --Jamie Zawinski, in comp.lang.emacs | ||
ܲ
, ˼ ˌ˂ 皱 ݵ . http://diveintopython.org/ ղ ۸ ȸ ˅ .
[7] SGMLParser ˼ ˤ ˞ (consumer): ס˼ HTML ס ܂. ȗ, feed “ (consumer)” ˟̹ ˤ 嵵 ̵. ˞, ס˼ һ, ա, ˼ ˂ ˞ ˅ ա˂ ˟ ࡢһ , һ ӈӈ й ȓ ˼ ˂ ˜ ݁ ˅ Ҵ ˅, һ ˼ ˤ챉 ˜˼ ˂ ˸ ӡ ˅ ˤ낄, " ˟̹ ȍ˼ ˅ " 騸 Ȃ ˅ 삄 ̦ˌˤ˂ ˼, һ ǹ ˼ ˅ ը, “ (parser) .” һ ס һ 皵 . 妵, ס˼ ȗ (˸˂ ).
[8] ըм Ђ ը˼ 챉 ը. ˼ ̧ ̧졒 皂 ȸ챉 . ը˼ Ղ, s = s + newpiece ˼ ˂ ըС ț ը 禡 ¡븇 ˟ ț ը , ˂ ը , ˼ ˼ ˞ ՠ ՠ ґ˂ ˼ ը 盡, s = s + newpiece ȍ ˼ ˞. ˞ , n˂ ̧ ˼ O(n), ՠ n˂ ը ̧ ˼ O(n^2).
[9] һ ˼ ݱނ .
[10] , ס˼ 鍂 ը˼ . ը˼ : “ 챉 ˤЫ һ ȍ ?” (: Emacs) ˼ “˼ һ˼ ?” (: “ Ҁ ȍֵ ס 챉 챉 ը.” -Larry Wall, 10/14/1998) һ HTML ը˼ տ ȗ ̉, ը , ˌ 뱉˅ ը.
| << 3 ˌ ̹皂 | 5 ˌ >> |