/*
 * Allocate an empty HTML document and wrap it as an instance of klass.
 *
 * The first two optional arguments, when truthy, are converted to C strings
 * and handed to htmlNewDoc() as its two string parameters (named uri and
 * external_id here); otherwise NULL is passed in their place.  After the
 * document is wrapped, the object's initialize method is invoked with the
 * full, original argument list.
 */
static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE extra_args;
  VALUE uri;
  VALUE external_id;
  VALUE wrapped;
  const xmlChar *c_uri = NULL;
  const xmlChar *c_external_id = NULL;
  htmlDocPtr html_doc;

  /* "0*": no required arguments, everything is collected into an array. */
  rb_scan_args(argc, argv, "0*", &extra_args);
  uri         = rb_ary_entry(extra_args, (long)0);
  external_id = rb_ary_entry(extra_args, (long)1);

  if (RTEST(uri)) {
    c_uri = (const xmlChar *)StringValueCStr(uri);
  }
  if (RTEST(external_id)) {
    c_external_id = (const xmlChar *)StringValueCStr(external_id);
  }

  html_doc = htmlNewDoc(c_uri, c_external_id);
  wrapped  = Nokogiri_wrap_xml_document(klass, html_doc);

  rb_obj_call_init(wrapped, argc, argv);

  return wrapped;
}
Create a new document
# File lib/nokogiri/html/document.rb, line 162
#
# Parse an HTML document from +string_or_io+.
#
# +options+ may be an Integer, in which case it is wrapped in a
# Nokogiri::XML::ParseOptions before use.  When a block is given, the
# options object is yielded to it before parsing starts.
#
# Inputs that respond to +read+ are parsed through read_io; everything
# else goes through read_memory, except nil/empty input, which yields a
# freshly created document (with +encoding+ applied when one is known).
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
  if Integer === options
    options = Nokogiri::XML::ParseOptions.new(options)
  end

  # Let the caller adjust the parse options.
  yield options if block_given?

  # Use the input's own encoding as the default, unless it is binary.
  if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
    encoding ||= string_or_io.encoding.name
  end

  if string_or_io.respond_to?(:read)
    url ||= (string_or_io.path if string_or_io.respond_to?(:path))

    if !encoding
      # Libxml2's parser has poor support for encoding
      # detection.  First, it does not recognize the HTML5
      # style meta charset declaration.  Secondly, even if it
      # successfully detects an encoding hint, it does not
      # re-decode or re-parse the preceding part which may be
      # garbled.
      #
      # EncodingReader aims to perform advanced encoding
      # detection beyond what Libxml2 does, and to emulate
      # rewinding of a stream and make Libxml2 redo parsing
      # from the start when an encoding hint is found.
      string_or_io = EncodingReader.new(string_or_io)
      begin
        return read_io(string_or_io, url, encoding, options.to_i)
      rescue EncodingFound => found
        # Fall through and parse again with the discovered encoding.
        encoding = found.found_encoding
      end
    end

    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  if string_or_io.nil? || string_or_io.empty?
    return new unless encoding

    empty_document = new
    empty_document.encoding = encoding
    return empty_document
  end

  encoding ||= EncodingReader.detect_encoding(string_or_io)

  read_memory(string_or_io, url, encoding, options.to_i)
end
Parse HTML. string_or_io may be a String, or any object that responds to read and close, such as an IO or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.
/*
 * Read an HTML document from io and wrap it as an instance of klass.
 *
 *   io       - object passed to htmlReadIO() as the context for
 *              io_read_callback / io_close_callback
 *   url      - String or nil; passed to htmlReadIO() as a C string or NULL
 *   encoding - String or nil; passed to htmlReadIO() as a C string or NULL
 *   options  - Integer parser options
 *
 * Errors reported by libxml2 while parsing are collected into an Array that
 * is stored in the returned document's @errors instance variable.
 *
 * Raises the exception returned by io's encoding_found method when io
 * responds to it and it is non-nil, a syntax error built from libxml2's
 * last error when parsing produced no document, or RuntimeError when no
 * document was produced and no error is recorded.
 */
static VALUE read_io(VALUE klass, VALUE io, VALUE url, VALUE encoding, VALUE options)
{
  const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  /*
   * Convert the options BEFORE the structured error handler is installed.
   * NUM2INT raises on a non-integer argument; if that happened while the
   * handler was active, libxml2 would be left holding a pointer to
   * error_list after this frame is gone.
   */
  int c_options = (int)NUM2INT(options);
  VALUE error_list = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadIO(
          io_read_callback,
          io_close_callback,
          (void *)io,
          c_url,
          c_enc,
          c_options
        );

  xmlSetStructuredErrorFunc(NULL, NULL);

  /*
   * If EncodingFound has occurred in EncodingReader, make sure to do
   * a cleanup and propagate the error.
   */
  if (rb_respond_to(io, id_encoding_found)) {
    VALUE encoding_found = rb_funcall(io, id_encoding_found, 0);
    if (!NIL_P(encoding_found)) {
      xmlFreeDoc(doc);
      rb_exc_raise(encoding_found);
    }
  }

  if (doc == NULL) {
    /* Nothing to free here: there is no document. */
    xmlErrorPtr error = xmlGetLastError();
    if (error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    return Qnil; /* not reached: both branches above raise */
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
Read the HTML document from io with given url, encoding, and options. See Nokogiri::HTML.parse
/*
 * Read an HTML document held in a Ruby String and wrap it as an instance
 * of klass.
 *
 *   string   - String containing the document
 *   url      - String or nil; passed to htmlReadMemory() as a C string or NULL
 *   encoding - String or nil; passed to htmlReadMemory() as a C string or NULL
 *   options  - Integer parser options
 *
 * Errors reported by libxml2 while parsing are collected into an Array that
 * is stored in the returned document's @errors instance variable.
 *
 * Raises RuntimeError when the string's length cannot be represented as an
 * int (htmlReadMemory() takes an int size), a syntax error built from
 * libxml2's last error when parsing produced no document, or RuntimeError
 * when no document was produced and no error is recorded.
 */
static VALUE read_memory(VALUE klass, VALUE string, VALUE url, VALUE encoding, VALUE options)
{
  const char *c_buffer = StringValuePtr(string);
  const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  /*
   * Convert the options BEFORE the structured error handler is installed.
   * NUM2INT raises on a non-integer argument; if that happened while the
   * handler was active, libxml2 would be left holding a pointer to
   * error_list after this frame is gone.
   */
  int c_options = (int)NUM2INT(options);
  long buffer_len = RSTRING_LEN(string);
  int len = (int)buffer_len;
  VALUE error_list;
  VALUE document;
  htmlDocPtr doc;

  /* Refuse lengths that were altered by the narrowing to int. */
  if ((long)len != buffer_len) {
    rb_raise(rb_eRuntimeError, "Could not parse document: input is too large");
  }

  error_list = rb_ary_new();

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadMemory(c_buffer, len, c_url, c_enc, c_options);

  xmlSetStructuredErrorFunc(NULL, NULL);

  if (doc == NULL) {
    /* Nothing to free here: there is no document. */
    xmlErrorPtr error = xmlGetLastError();
    if (error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    return Qnil; /* not reached: both branches above raise */
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
Read the HTML document contained in string with given url, encoding, and options. See Nokogiri::HTML.parse
# File lib/nokogiri/html/document.rb, line 149
#
# Build a DocumentFragment belonging to this document from +tags+
# (nil by default), passing this document's root along as the third
# constructor argument.
def fragment(tags = nil)
  DocumentFragment.new(self, tags, root)
end
Create a Nokogiri::XML::DocumentFragment from tags
# File lib/nokogiri/html/document.rb, line 7
#
# Return the encoding declared by this document's meta tag.
#
# A <meta charset="..."> element takes precedence; failing that, the
# charset is extracted from the content attribute of the element found
# by meta_content_type.  Returns nil when neither element is present.
def meta_encoding
  if (charset_meta = at('//meta[@charset]'))
    charset_meta[:charset]
  elsif (content_type_meta = meta_content_type)
    content_type_meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
  end
end
Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.
# File lib/nokogiri/html/document.rb, line 32
#
# Declare +encoding+ in this document's meta tag.
#
# An existing element found by meta_content_type is updated first;
# otherwise an existing <meta charset> element is updated.  When neither
# exists a new meta element is created: it uses the charset attribute if
# the internal subset reports an HTML5 DTD, and the
# http-equiv/content pair otherwise.  The new element is prepended to
# <head> when there is one, or handed to set_metadata_element.
def meta_encoding=(encoding)
  if (content_type_meta = meta_content_type)
    content_type_meta['content'] = 'text/html; charset=%s' % encoding
    encoding
  elsif (charset_meta = at('//meta[@charset]'))
    charset_meta['charset'] = encoding
  else
    new_meta = XML::Node.new('meta', self)

    subset = internal_subset
    if subset && subset.html5_dtd?
      new_meta['charset'] = encoding
    else
      new_meta['http-equiv'] = 'Content-Type'
      new_meta['content'] = 'text/html; charset=%s' % encoding
    end

    if (head = at('//head'))
      head.prepend_child(new_meta)
    else
      set_metadata_element(new_meta)
    end

    encoding
  end
end
Set the meta tag encoding for this document.
If a meta encoding tag is already present, its content is replaced with the given text.
Otherwise, this method tries to create one at an appropriate place, supplying head and/or html elements as necessary: inside a head element if any, and before any text node or content element (typically <body>) if any.
The result when trying to set an encoding that is different from the document encoding is undefined.
Beware that in CRuby, libxml2 automatically inserts a meta tag into a head element.
# File lib/nokogiri/html/document.rb, line 142
#
# Serialize this document, defaulting the :save_with option to
# XML::Node::SaveOptions::DEFAULT_HTML when the caller did not supply a
# truthy value for it.  The default is written into the +options+ hash
# itself before delegating to the superclass implementation.
def serialize(options = {})
  unless options[:save_with]
    options[:save_with] = XML::Node::SaveOptions::DEFAULT_HTML
  end

  # Bare super: forwards the current arguments (and any block) unchanged.
  super
end
Serialize Node using options. Save options can also be set using a block. See SaveOptions.
These two statements are equivalent:
node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
or
node.serialize(:encoding => 'UTF-8') do |config| config.format.as_xml end
Nokogiri::XML::Node#serialize
# File lib/nokogiri/html/document.rb, line 68
#
# Return the inner text of the first <title> element, or nil when the
# document has no <title>.
def title
  title_node = at('//title')
  title_node && title_node.inner_text
end
Get the title string of this document. Return nil if there is no title tag.
# File lib/nokogiri/html/document.rb, line 83
#
# Set the document title to +text+ and return +text+.
#
# An existing <title> element has its children replaced by a single
# text node.  Otherwise a new <title> is created and placed, in order of
# preference: appended to <head>; right after a meta charset /
# content-type element; or wherever set_metadata_element puts it.
def title=(text)
  text_node = XML::Text.new(text, self)

  existing_title = at('//title')
  if existing_title
    existing_title.children = text_node
    return text
  end

  new_title = XML::Node.new('title', self) << text_node

  if (head = at('//head'))
    head << new_title
  elsif (charset_meta = at('//meta[@charset]') || meta_content_type)
    # better put after charset declaration
    charset_meta.add_next_sibling(new_title)
  else
    set_metadata_element(new_title)
  end

  text
end
Set the title string of this document.
If a title element is already present, its content is replaced with the given text.
Otherwise, this method tries to create one at an appropriate place, supplying head and/or html elements as necessary: inside a head element if any, right after a meta encoding/charset tag if any, and before any text node or content element (typically <body>) if any.
/*
 * Return the type field of the wrapped libxml2 document as a Ruby Integer.
 */
static VALUE type(VALUE self)
{
  htmlDocPtr html_doc;
  long doc_type;

  Data_Get_Struct(self, xmlDoc, html_doc);
  doc_type = (long)html_doc->type;

  return INT2NUM(doc_type);
}
The type for this document
© 2008–2018 Aaron Patterson, Mike Dalessio, Charles Nutter, Sergio Arbeo,
Patrick Mahoney, Yoko Harada, Akinori MUSHA, John Shahid, Lars Kanis
Licensed under the MIT License.