class simple_html_dom

  1. cis7 sites/all/modules/ulmus/simplehtmldom/simplehtmldom/simple_html_dom.php simple_html_dom
  2. cle7 sites/all/modules/ulmus/simplehtmldom/simplehtmldom/simple_html_dom.php simple_html_dom
  3. elmsmedia7 sites/all/modules/ulmus/simplehtmldom/simplehtmldom/simple_html_dom.php simple_html_dom
  4. icor7 sites/all/modules/ulmus/simplehtmldom/simplehtmldom/simple_html_dom.php simple_html_dom
  5. meedjum_blog7 sites/all/modules/ulmus/simplehtmldom/simplehtmldom/simple_html_dom.php simple_html_dom
  6. mooc7 sites/all/modules/ulmus/simplehtmldom/simplehtmldom/simple_html_dom.php simple_html_dom

Hierarchy

Expanded class hierarchy of simple_html_dom

Members

Contains filters are case sensitive
Namesort descending Modifiers Type Description
simple_html_dom::$block_tags protected property
simple_html_dom::$callback public property
simple_html_dom::$char protected property
simple_html_dom::$cursor protected property
simple_html_dom::$doc protected property
simple_html_dom::$lowercase public property
simple_html_dom::$nodes public property
simple_html_dom::$noise protected property
simple_html_dom::$optional_closing_tags protected property
simple_html_dom::$parent protected property
simple_html_dom::$pos protected property
simple_html_dom::$root public property
simple_html_dom::$self_closing_tags protected property
simple_html_dom::$size protected property
simple_html_dom::$token_attr protected property
simple_html_dom::$token_blank protected property
simple_html_dom::$token_equal protected property
simple_html_dom::$token_slash protected property
simple_html_dom::as_text_node protected function
simple_html_dom::childNodes function
simple_html_dom::clear function
simple_html_dom::copy_skip protected function
simple_html_dom::copy_until protected function
simple_html_dom::copy_until_char protected function
simple_html_dom::copy_until_char_escape protected function
simple_html_dom::dump function
simple_html_dom::find function
simple_html_dom::firstChild function
simple_html_dom::getElementById function
simple_html_dom::getElementByTagName function
simple_html_dom::getElementsById function
simple_html_dom::getElementsByTagName function
simple_html_dom::lastChild function
simple_html_dom::link_nodes protected function
simple_html_dom::load function
simple_html_dom::loadFile function
simple_html_dom::load_file function
simple_html_dom::parse protected function
simple_html_dom::parse_attr protected function
simple_html_dom::prepare protected function
simple_html_dom::read_tag protected function
simple_html_dom::remove_callback function
simple_html_dom::remove_noise protected function
simple_html_dom::restore_noise function
simple_html_dom::save function
simple_html_dom::set_callback function
simple_html_dom::skip protected function
simple_html_dom::__construct function
simple_html_dom::__destruct function
simple_html_dom::__get function
simple_html_dom::__toString function

File

sites/all/modules/ulmus/simplehtmldom/simplehtmldom/simple_html_dom.php, line 481

View source
class simple_html_dom {
  public $root = null;
  public $nodes = array();
  public $callback = null;
  public $lowercase = false;
  protected $pos;
  protected $doc;
  protected $char;
  protected $size;
  protected $cursor;
  protected $parent;
  protected $noise = array();
  protected $token_blank = " \t\r\n";
  protected $token_equal = ' =/>';
  protected $token_slash = " />\r\n\t";
  protected $token_attr = ' >';
  // use isset instead of in_array, performance boost about 30%...
  protected $self_closing_tags = array(
    'img' => 1,
    'br' => 1,
    'input' => 1,
    'meta' => 1,
    'link' => 1,
    'hr' => 1,
    'base' => 1,
    'embed' => 1,
    'spacer' => 1,
  );
  protected $block_tags = array(
    'root' => 1,
    'body' => 1,
    'form' => 1,
    'div' => 1,
    'span' => 1,
    'table' => 1,
  );
  protected $optional_closing_tags = array(
    'tr' => array(
      'tr' => 1,
      'td' => 1,
      'th' => 1,
    ),
    'th' => array('th' => 1),
    'td' => array('td' => 1),
    'li' => array('li' => 1),
    'dt' => array(
      'dt' => 1,
      'dd' => 1,
    ),
    'dd' => array(
      'dd' => 1,
      'dt' => 1,
    ),
    'dl' => array(
      'dd' => 1,
      'dt' => 1,
    ),
    'p' => array('p' => 1),
    'nobr' => array('nobr' => 1),
  );

  function __construct($str = null) {
    if ($str) {
      if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
        $this->load_file($str);
      }
      else {
        $this->load($str);
      }
    }
  }

  function __destruct() {
    $this->clear();
  }

  // load html from string
  function load($str, $lowercase = true) {
    // prepare
    $this->prepare($str, $lowercase);
    // strip out comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // strip out cdata
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
    // strip out <style> tags
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
    // strip out <script> tags
    $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
    $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
    // strip out preformatted tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // strip out server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
    // strip smarty scripts
    $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

    // parsing
    while ($this->parse());
    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
  }

  // load html from file
  function load_file() {
    $args = func_get_args();
    $this->load(call_user_func_array('file_get_contents', $args), true);
  }

  // set callback function
  function set_callback($function_name) {
    $this->callback = $function_name;
  }

  // remove callback function
  function remove_callback() {
    $this->callback = null;
  }

  // save dom as string
  function save($filepath = '') {
    $ret = $this->root->innertext();
    if ($filepath !== '') {
      file_put_contents($filepath, $ret);
    }
    return $ret;
  }

  // find dom node by css selector
  function find($selector, $idx = null) {
    return $this->root->find($selector, $idx);
  }

  // clean up memory due to php5 circular references memory leak...
  function clear() {
    foreach ($this->nodes as $n) {
      $n->clear();
      $n = null;
    }
    if (isset($this->parent)) {
      $this->parent->clear();
      unset($this->parent);
    }
    if (isset($this->root)) {
      $this->root->clear();
      unset($this->root);
    }
    unset($this->doc);
    unset($this->noise);
  }

  function dump($show_attr = true) {
    $this->root->dump($show_attr);
  }

  // prepare HTML data and init everything
  protected function prepare($str, $lowercase = true) {
    $this->clear();
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->root = new simple_html_dom_node($this);
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;
    // set the length of content
    $this->size = strlen($str);
    if ($this->size > 0) {
      $this->char = $this->doc[0];
    }
  }

  // parse html content
  protected function parse() {
    if (($s = $this->copy_until_char('<')) === '') {
      return $this->read_tag();
    }

    // text
    $node = new simple_html_dom_node($this);
    ++$this->cursor;
    $node->_[HDOM_INFO_TEXT] = $s;
    $this->link_nodes($node, false);
    return true;
  }

  // read tag info
  protected function read_tag() {
    if ($this->char !== '<') {
      $this->root->_[HDOM_INFO_END] = $this->cursor;
      return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      $this->skip($this->token_blank_t);
      $tag = $this->copy_until_char('>');

      // skip attributes in end tag
      if (($pos = strpos($tag, ' ')) !== false) {
        $tag = substr($tag, 0, $pos);
      }

      $parent_lower = strtolower($this->parent->tag);
      $tag_lower = strtolower($tag);

      if ($parent_lower !== $tag_lower) {
        if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
          $this->parent->_[HDOM_INFO_END] = 0;
          $org_parent = $this->parent;

          while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
            $this->parent = $this->parent->parent;
          }

          if (strtolower($this->parent->tag) !== $tag_lower) {
            $this->parent = $org_parent; // restore origonal parent
            if ($this->parent->parent) {
              $this->parent = $this->parent->parent;
            }
            $this->parent->_[HDOM_INFO_END] = $this->cursor;
            return $this->as_text_node($tag);
          }
        }
        else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
          $this->parent->_[HDOM_INFO_END] = 0;
          $org_parent = $this->parent;

          while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
            $this->parent = $this->parent->parent;
          }

          if (strtolower($this->parent->tag) !== $tag_lower) {
            $this->parent = $org_parent; // restore origonal parent
            $this->parent->_[HDOM_INFO_END] = $this->cursor;
            return $this->as_text_node($tag);
          }
        }
        else if (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
          $this->parent->_[HDOM_INFO_END] = 0;
          $this->parent = $this->parent->parent;
        }
        else {
          return $this->as_text_node($tag);
        }
      }

      $this->parent->_[HDOM_INFO_END] = $this->cursor;
      if ($this->parent->parent) {
        $this->parent = $this->parent->parent;
      }

      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0] === '!') {
      $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

      if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') {
        $node->nodetype = HDOM_TYPE_COMMENT;
        $node->tag = 'comment';
      }
      else {
        $node->nodetype = HDOM_TYPE_UNKNOWN;
        $node->tag = 'unknown';
      }

      if ($this->char === '>') {
        $node->_[HDOM_INFO_TEXT] .= '>';
      }
      $this->link_nodes($node, true);
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      return true;
    }

    // text
    if ($pos = strpos($tag, '<') !== false) {
      $tag = '<' . substr($tag, 0, -1);
      $node->_[HDOM_INFO_TEXT] = $tag;
      $this->link_nodes($node, false);
      $this->char = $this->doc[--$this->pos]; // prev
      return true;
    }

    if (!preg_match("/^[\w-:]+$/", $tag)) {
      $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
      if ($this->char === '<') {
        $this->link_nodes($node, false);
        return true;
      }

      if ($this->char === '>') {
        $node->_[HDOM_INFO_TEXT] .= '>';
      }
      $this->link_nodes($node, false);
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
      while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
        $this->parent->_[HDOM_INFO_END] = 0;
        $this->parent = $this->parent->parent;
      }
      $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
      if ($this->char !== null && $space[0] === '') {
        break;
      }
      $name = $this->copy_until($this->token_equal);
      if ($guard === $this->pos) {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        continue;
      }
      $guard = $this->pos;

      // handle endless '<'
      if ($this->pos >= $this->size -1 && $this->char !== '>') {
        $node->nodetype = HDOM_TYPE_TEXT;
        $node->_[HDOM_INFO_END] = 0;
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
        $node->tag = 'text';
        $this->link_nodes($node, false);
        return true;
      }

      // handle mismatch '<'
      if ($this->doc[$this->pos -1] == '<') {
        $node->nodetype = HDOM_TYPE_TEXT;
        $node->tag = 'text';
        $node->attr = array();
        $node->_[HDOM_INFO_END] = 0;
        $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos -$begin_tag_pos -1);
        $this->pos -= 2;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $this->link_nodes($node, false);
        return true;
      }

      if ($name !== '/' && $name !== '') {
        $space[1] = $this->copy_skip($this->token_blank);
        $name = $this->restore_noise($name);
        if ($this->lowercase) {
          $name = strtolower($name);
        }
        if ($this->char === '=') {
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
          $this->parse_attr($node, $name, $space);
        }
        else {
          //no value attr: nowrap, checked selected...
          $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
          $node->attr[$name] = true;
          if ($this->char != '>') {
            $this->char = $this->doc[--$this->pos]; // prev
          }
        }
        $node->_[HDOM_INFO_SPACE][] = $space;
        $space = array($this->copy_skip($this->token_blank), '', '');
      }
      else {
        break;
      }
    } while ($this->char !== '>' && $this->char !== '/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>') === '/') {
      $node->_[HDOM_INFO_ENDSPACE] .= '/';
      $node->_[HDOM_INFO_END] = 0;
    }
    else {
      // reset parent
      if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
        $this->parent = $node;
      }
    }
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    return true;
  }

  // parse attributes
  protected function parse_attr($node, $name, &$space) {
    $space[2] = $this->copy_skip($this->token_blank);
    switch ($this->char) {
      case '"':
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"'));
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        break;
      case '\'':
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\''));
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        break;
      default:
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
    }
  }

  // link node's parent
  protected function link_nodes(&$node, $is_child) {
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;
    if ($is_child) {
      $this->parent->children[] = $node;
    }
  }

  // as a text node
  protected function as_text_node($tag) {
    $node = new simple_html_dom_node($this);
    ++$this->cursor;
    $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($node, false);
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    return true;
  }

  protected function skip($chars) {
    $this->pos += strspn($this->doc, $chars, $this->pos);
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  }

  protected function copy_skip($chars) {
    $pos = $this->pos;
    $len = strspn($this->doc, $chars, $pos);
    $this->pos += $len;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    if ($len === 0) {
      return '';
    }
    return substr($this->doc, $pos, $len);
  }

  protected function copy_until($chars) {
    $pos = $this->pos;
    $len = strcspn($this->doc, $chars, $pos);
    $this->pos += $len;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    return substr($this->doc, $pos, $len);
  }

  protected function copy_until_char($char) {
    if ($this->char === null) {
      return '';
    }

    if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
      $ret = substr($this->doc, $this->pos, $this->size -$this->pos);
      $this->char = null;
      $this->pos = $this->size;
      return $ret;
    }

    if ($pos === $this->pos) {
      return '';
    }
    $pos_old = $this->pos;
    $this->char = $this->doc[$pos];
    $this->pos = $pos;
    return substr($this->doc, $pos_old, $pos -$pos_old);
  }

  protected function copy_until_char_escape($char) {
    if ($this->char === null) {
      return '';
    }

    $start = $this->pos;
    while (1) {
      if (($pos = strpos($this->doc, $char, $start)) === false) {
        $ret = substr($this->doc, $this->pos, $this->size -$this->pos);
        $this->char = null;
        $this->pos = $this->size;
        return $ret;
      }

      if ($pos === $this->pos) {
        return '';
      }

      if ($this->doc[$pos -1] === '\\') {
        $start = $pos + 1;
        continue;
      }

      $pos_old = $this->pos;
      $this->char = $this->doc[$pos];
      $this->pos = $pos;
      return substr($this->doc, $pos_old, $pos -$pos_old);
    }
  }

  // remove noise from html content
  protected function remove_noise($pattern, $remove_tag = false) {
    $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    for ($i = $count -1; $i > -1; --$i) {
      $key = '___noise___' . sprintf('% 3d', count($this->noise) + 100);
      $idx = ($remove_tag) ? 0 : 1;
      $this->noise[$key] = $matches[$i][$idx][0];
      $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
      $this->char = $this->doc[0];
    }
  }

  // restore noise to html content
  function restore_noise($text) {
    while (($pos = strpos($text, '___noise___')) !== false) {
      $key = '___noise___' . $text[$pos + 11] . $text[$pos + 12] . $text[$pos + 13];
      if (isset($this->noise[$key])) {
        $text = substr($text, 0, $pos) . $this->noise[$key] . substr($text, $pos + 14);
      }
    }
    return $text;
  }

  function __toString() {
    return $this->root->innertext();
  }

  function __get($name) {
    switch ($name) {
      case 'outertext':
        return $this->root->innertext();
      case 'innertext':
        return $this->root->innertext();
      case 'plaintext':
        return $this->root->text();
    }
  }

  // camel naming conventions
  function childNodes($idx = -1) {
    return $this->root->childNodes($idx);
  }
  function firstChild() {
    return $this->root->first_child();
  }
  function lastChild() {
    return $this->root->last_child();
  }
  function getElementById($id) {
    return $this->find("#$id", 0);
  }
  function getElementsById($id, $idx = null) {
    return $this->find("#$id", $idx);
  }
  function getElementByTagName($name) {
    return $this->find($name, 0);
  }
  function getElementsByTagName($name, $idx = -1) {
    return $this->find($name, $idx);
  }
  function loadFile() {
    $args = func_get_args();
    $this->load(call_user_func_array('file_get_contents', $args), true);
  }
}
Error | ELMSLN API

Error

×

Error message

  • Warning: Cannot modify header information - headers already sent by (output started at /var/www/html/elmsln_community/api.elmsln.org/includes/common.inc:2791) in drupal_send_headers() (line 1499 of /var/www/html/elmsln_community/api.elmsln.org/includes/bootstrap.inc).
  • Error: Call to undefined function apc_delete() in DrupalAPCCache->clear() (line 289 of /var/www/html/elmsln_community/api.elmsln.org/sites/all/modules/apc/drupal_apc_cache.inc).
The website encountered an unexpected error. Please try again later.