Python Parse HTML
How to parse HTML in Python using the HTMLParser class from the standard-library html.parser module.
--------------------------------------------------
#Output
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Sample HTML Document</title>
<meta name="description" content="This is a sample HTML file" />
<meta name="author" content="Administrator" />
<meta name="viewport" content="width=device-width; initial-scale=1.0" />
<!-- Replace favicon.ico & apple-touch-icon.png in the root of your domain and delete these references -->
<link rel="shortcut icon" href="/favicon.ico" />
<link rel="apple-touch-icon" href="/apple-touch-icon.png" />
</head>
<body>
<div>
<header>
<h1>HTML Sample File</h1>
</header>
<nav>
<p>
<a href="/">Home</a>
</p>
<p>
<a href="/contact">Contact</a>
</p>
</nav>
<div>
</div>
<footer>
<p>© Copyright by Administrator</p>
</footer>
</div>
</body>
</html>
Encountered comment:
At line: 9 position 4
#
# Python parse HTML
#
from html.parser import HTMLParser


class AnHTMLParser(HTMLParser):
    """HTML parser that reports every comment it encounters."""

    def handle_comment(self, data):
        """Print a comment's text and the parser position where it was found.

        Args:
            data: The text between the ``<!--`` and ``-->`` delimiters.
        """
        print("Encountered comment: ", data)
        # getpos() returns a (line number, offset) tuple.
        pos = self.getpos()
        print("\tAt line: ", pos[0], " position ", pos[1])


def main():
    """Read ``samplehtml.html``, echo its contents, and report its comments.

    Raises:
        OSError: If ``samplehtml.html`` cannot be opened or read.
    """
    # The context manager closes the file even if reading fails; the sample
    # document declares UTF-8, so decode it explicitly rather than relying on
    # the platform's default encoding.
    with open("samplehtml.html", encoding="utf-8") as f:
        contents = f.read()
    print(contents)

    # Instantiate the parser and feed it the HTML.
    parser = AnHTMLParser()
    parser.feed(contents)
    # Flush any input still buffered by the parser (treats it as end-of-file).
    parser.close()


if __name__ == "__main__":
    main()
--------------------------------------------------
#Output
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Sample HTML Document</title>
<meta name="description" content="This is a sample HTML file" />
<meta name="author" content="Administrator" />
<meta name="viewport" content="width=device-width; initial-scale=1.0" />
<!-- Replace favicon.ico & apple-touch-icon.png in the root of your domain and delete these references -->
<link rel="shortcut icon" href="/favicon.ico" />
<link rel="apple-touch-icon" href="/apple-touch-icon.png" />
</head>
<body>
<div>
<header>
<h1>HTML Sample File</h1>
</header>
<nav>
<p>
<a href="/">Home</a>
</p>
<p>
<a href="/contact">Contact</a>
</p>
</nav>
<div>
</div>
<footer>
<p>© Copyright by Administrator</p>
</footer>
</div>
</body>
</html>
Encountered comment:
At line: 9 position 4
Comments
Post a Comment