Crawler.php 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Masterminds\HTML5;
  12. use Symfony\Component\CssSelector\CssSelectorConverter;
  13. /**
  14. * Crawler eases navigation of a list of \DOMNode objects.
  15. *
  16. * @author Fabien Potencier <fabien@symfony.com>
  17. */
  18. class Crawler implements \Countable, \IteratorAggregate
  19. {
  20. protected $uri;
  21. /**
  22. * @var string The default namespace prefix to be used with XPath and CSS expressions
  23. */
  24. private $defaultNamespacePrefix = 'default';
  25. /**
  26. * @var array A map of manually registered namespaces
  27. */
  28. private $namespaces = [];
  29. /**
  30. * @var string The base href value
  31. */
  32. private $baseHref;
  33. /**
  34. * @var \DOMDocument|null
  35. */
  36. private $document;
  37. /**
  38. * @var \DOMNode[]
  39. */
  40. private $nodes = [];
  41. /**
  42. * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
  43. *
  44. * @var bool
  45. */
  46. private $isHtml = true;
  47. /**
  48. * @var HTML5|null
  49. */
  50. private $html5Parser;
  /**
   * Creates a crawler, optionally seeded with initial content.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node     Initial content, handed over to add()
   * @param string|null                                  $uri      The URI to store as the current URI
   * @param string|null                                  $baseHref The base href; when empty, $uri is used instead
   */
  public function __construct($node = null, string $uri = null, string $baseHref = null)
  {
      $this->uri = $uri;
      $this->baseHref = $baseHref ?: $uri;

      // The HTML5 parser is optional: it is only instantiated when the library is installed
      $this->html5Parser = null;
      if (class_exists(HTML5::class)) {
          $this->html5Parser = new HTML5(['disable_html_ns' => true]);
      }

      $this->add($node);
  }
  /**
   * Gives access to the URI stored on this crawler.
   *
   * @return string|null The stored URI (the constructor defaults it to null)
   */
  public function getUri()
  {
      return $this->uri;
  }
  /**
   * Gives access to the base href stored on this crawler.
   *
   * The constructor initializes it from its $baseHref argument (falling back
   * to the URI), and addHtmlContent() may replace it afterwards.
   *
   * @return string|null The stored base href
   */
  public function getBaseHref()
  {
      return $this->baseHref;
  }
  /**
   * Empties the crawler: forgets the owning document and drops every node.
   */
  public function clear()
  {
      $this->document = null;
      $this->nodes = [];
  }
  /**
   * Adds content to the current list of nodes.
   *
   * The argument is dispatched to the specialized add*() method matching
   * its type; null is accepted and ignored.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A node
   *
   * @throws \InvalidArgumentException when node is not one of the expected types
   */
  public function add($node)
  {
      if (null === $node) {
          return;
      }

      if ($node instanceof \DOMNodeList) {
          $this->addNodeList($node);

          return;
      }

      if ($node instanceof \DOMNode) {
          $this->addNode($node);

          return;
      }

      if (\is_array($node)) {
          $this->addNodes($node);

          return;
      }

      if (\is_string($node)) {
          $this->addContent($node);

          return;
      }

      throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', get_debug_type($node)));
  }
  /**
   * Adds HTML/XML content.
   *
   * When no content type is given, content starting with "<?xml" is treated
   * as XML and anything else as HTML. Content types that mention neither
   * "xml" nor "html" are ignored.
   *
   * The charset is taken from the content type when it declares one, else
   * from a <meta ... charset=...> declaration found in the content. Failing
   * both, it is assumed to be UTF-8 when the content is valid UTF-8, or
   * ISO-8859-1 as a fallback, which is the default charset defined by the
   * HTTP 1.1 specification.
   *
   * @param string      $content The HTML or XML source
   * @param string|null $type    The content type, e.g. "text/html; charset=utf-8"
   */
  public function addContent(string $content, string $type = null)
  {
      if (empty($type)) {
          $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
      }

      // DOM only for HTML/XML content
      if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
          return;
      }

      $charset = null;
      if (false !== $pos = stripos($type, 'charset=')) {
          $charset = substr($type, $pos + 8);
          if (false !== $pos = strpos($charset, ';')) {
              $charset = substr($charset, 0, $pos);
          }

          // The parameter value may be quoted or padded with whitespace;
          // an empty value is treated as "no charset declared"
          $charset = trim($charset, " \t\"'");
          if ('' === $charset) {
              $charset = null;
          }
      }

      // http://www.w3.org/TR/encoding/#encodings
      // http://www.w3.org/TR/REC-xml/#NT-EncName
      if (null === $charset &&
          preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
          $charset = $matches[1];
      }

      if (null === $charset) {
          $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';
      }

      // The pattern above is case-insensitive, so the captured group must be
      // compared case-insensitively too ("application/XML" is still XML)
      if ('x' === strtolower($xmlMatches[1])) {
          $this->addXmlContent($content, $charset);
      } else {
          $this->addHtmlContent($content, $charset);
      }
  }
  /**
   * Adds an HTML content to the list of nodes.
   *
   * The content is parsed through parseHtmlString() and the resulting
   * document is added to the crawler. When the crawler then contains a
   * <base> element with a non-empty href, that href becomes the base href;
   * if a base href was already set, the new one is resolved against it
   * through a Link object.
   *
   * @param string $content The HTML source
   * @param string $charset The charset of the content
   */
  public function addHtmlContent(string $content, string $charset = 'UTF-8')
  {
      $document = $this->parseHtmlString($content, $charset);
      $this->addDocument($document);

      $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
      $firstHref = current($hrefs);

      // No <base> element, or one without a usable href: keep the current base href
      if (!\count($hrefs) || empty($firstHref)) {
          return;
      }

      if (!$this->baseHref) {
          $this->baseHref = $firstHref;

          return;
      }

      // Resolve the <base> href against the base href already known
      $anchor = $document->createElement('a');
      $anchor->setAttribute('href', $firstHref);
      $resolved = new Link($anchor, $this->baseHref);
      $this->baseHref = $resolved->getUri();
  }
  /**
   * Adds an XML content to the list of nodes.
   *
   * libxml internal error handling is switched on while the content is
   * parsed and the previous setting is restored afterwards.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * @param string $content The XML source
   * @param string $charset The encoding given to the \DOMDocument
   * @param int    $options Bitwise OR of the libxml option constants
   *                        LIBXML_PARSEHUGE is dangerous, see
   *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
   */
  public function addXmlContent(string $content, string $charset = 'UTF-8', int $options = \LIBXML_NONET)
  {
      // remove the default namespace if it's the only namespace to make XPath expressions simpler
      if (!preg_match('/xmlns:/', $content)) {
          $content = str_replace('xmlns', 'ns', $content);
      }

      // The entity loader is only toggled on libxml versions older than 2.9.0
      $legacyLibxml = \LIBXML_VERSION < 20900;

      $previousErrorMode = libxml_use_internal_errors(true);
      if ($legacyLibxml) {
          $previousEntityLoader = libxml_disable_entity_loader(true);
      }

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      // Blank content is not parsed: the document is simply left empty
      if ('' !== trim($content)) {
          @$document->loadXML($content, $options);
      }

      libxml_use_internal_errors($previousErrorMode);
      if ($legacyLibxml) {
          libxml_disable_entity_loader($previousEntityLoader);
      }

      $this->addDocument($document);

      $this->isHtml = false;
  }
  /**
   * Adds the root element of a \DOMDocument to the list of nodes.
   *
   * A document without a root element is ignored.
   *
   * @param \DOMDocument $dom A \DOMDocument instance
   */
  public function addDocument(\DOMDocument $dom)
  {
      $root = $dom->documentElement;
      if (!$root) {
          return;
      }

      $this->addNode($root);
  }
  /**
   * Adds every \DOMNode of a \DOMNodeList to the list of nodes.
   *
   * Entries that are not \DOMNode instances are skipped.
   *
   * @param \DOMNodeList $nodes A \DOMNodeList instance
   */
  public function addNodeList(\DOMNodeList $nodes)
  {
      foreach ($nodes as $candidate) {
          if (!$candidate instanceof \DOMNode) {
              continue;
          }

          $this->addNode($candidate);
      }
  }
  /**
   * Adds each entry of an array to the list of nodes.
   *
   * Every entry goes through add(), so the same type rules apply to it.
   *
   * @param \DOMNode[] $nodes An array of \DOMNode instances
   */
  public function addNodes(array $nodes)
  {
      foreach ($nodes as $entry) {
          $this->add($entry);
      }
  }
  /**
   * Adds a \DOMNode instance to the list of nodes.
   *
   * A \DOMDocument is replaced by its root element; a document without a
   * root element is ignored, as addDocument() does. All nodes of a crawler
   * must belong to the same document, and a node already present is not
   * added a second time.
   *
   * @param \DOMNode $node A \DOMNode instance
   *
   * @throws \InvalidArgumentException when the node belongs to another document than the nodes already added
   */
  public function addNode(\DOMNode $node)
  {
      if ($node instanceof \DOMDocument) {
          $node = $node->documentElement;

          // An empty document has no root element: without this guard a
          // null entry would end up in the node list
          if (null === $node) {
              return;
          }
      }

      if (null !== $this->document && $this->document !== $node->ownerDocument) {
          throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
      }

      // The first node added decides which document the crawler is bound to
      if (null === $this->document) {
          $this->document = $node->ownerDocument;
      }

      // Don't add duplicate nodes in the Crawler
      if (\in_array($node, $this->nodes, true)) {
          return;
      }

      $this->nodes[] = $node;
  }
  /**
   * Returns a sub-crawler for the node at the given position.
   *
   * When no node exists at that position, the sub-crawler is created from null.
   *
   * @return static
   */
  public function eq(int $position)
  {
      return $this->createSubCrawler($this->nodes[$position] ?? null);
  }
  /**
   * Calls an anonymous function on each node of the list.
   *
   * The anonymous function receives the node, wrapped through
   * createSubCrawler(), as first argument and its position as second argument.
   *
   * Example:
   *
   *     $crawler->filter('h1')->each(function ($node, $i) {
   *         return $node->text();
   *     });
   *
   * @param \Closure $closure An anonymous function
   *
   * @return array An array of values returned by the anonymous function
   */
  public function each(\Closure $closure)
  {
      $results = [];
      foreach ($this->nodes as $index => $domNode) {
          $wrapped = $this->createSubCrawler($domNode);
          $results[] = $closure($wrapped, $index);
      }

      return $results;
  }
  /**
   * Extracts a portion of the list of nodes.
   *
   * $offset and $length are passed to array_slice() unchanged.
   *
   * @return static
   */
  public function slice(int $offset = 0, int $length = null)
  {
      $subset = \array_slice($this->nodes, $offset, $length);

      return $this->createSubCrawler($subset);
  }
  /**
   * Reduces the list of nodes by calling an anonymous function.
   *
   * The anonymous function receives each node (wrapped through
   * createSubCrawler()) and its position. A node is removed from the result
   * only when the function returns exactly false.
   *
   * @param \Closure $closure An anonymous function
   *
   * @return static
   */
  public function reduce(\Closure $closure)
  {
      $kept = [];
      foreach ($this->nodes as $index => $domNode) {
          if (false === $closure($this->createSubCrawler($domNode), $index)) {
              continue;
          }

          $kept[] = $domNode;
      }

      return $this->createSubCrawler($kept);
  }
  /**
   * Returns the first node of the current selection.
   *
   * Shortcut for eq(0).
   *
   * @return static
   */
  public function first()
  {
      return $this->eq(0);
  }
  /**
   * Returns the last node of the current selection.
   *
   * Delegates to eq() with the index of the last node of the list.
   *
   * @return static
   */
  public function last()
  {
      $lastIndex = \count($this->nodes) - 1;

      return $this->eq($lastIndex);
  }
  /**
   * Returns the siblings nodes of the current selection.
   *
   * The lookup starts from the first child of the parent of the first node
   * and is delegated to sibling().
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function siblings()
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstChildOfParent = $this->getNode(0)->parentNode->firstChild;

      return $this->createSubCrawler($this->sibling($firstChildOfParent));
  }
  /**
   * Tells whether at least one node of the list matches a CSS selector.
   *
   * The selector is converted to an XPath expression on the "self::" axis
   * and applied to the nodes. An empty crawler never matches.
   */
  public function matches(string $selector): bool
  {
      if (empty($this->nodes)) {
          return false;
      }

      $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'self::');
      $matched = $this->filterRelativeXPath($xpath);

      return 0 !== $matched->count();
  }
  /**
   * Returns the closest element matching the selector, starting at the first
   * node of the list and walking up through its parents toward the document root.
   *
   * @see https://developer.mozilla.org/en-US/docs/Web/API/Element/closest#Polyfill
   *
   * @return static|null The first matching element wrapped in a crawler, or null when none matches
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function closest(string $selector): ?self
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $domNode = $this->getNode(0);

      // parentNode is null for a detached element, so it must be checked
      // before nodeType is read
      while (null !== $domNode && \XML_ELEMENT_NODE === $domNode->nodeType) {
          $node = $this->createSubCrawler($domNode);
          if ($node->matches($selector)) {
              return $node;
          }

          $domNode = $node->getNode(0)->parentNode;
      }

      return null;
  }
  /**
   * Returns the next siblings nodes of the current selection.
   *
   * The lookup is delegated to sibling(), starting from the first node of the list.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nextAll()
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $following = $this->sibling($this->getNode(0));

      return $this->createSubCrawler($following);
  }
  /**
   * Returns the previous sibling nodes of the current selection.
   *
   * The lookup is delegated to sibling() in 'previousSibling' direction,
   * starting from the first node of the list.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function previousAll()
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $preceding = $this->sibling($this->getNode(0), 'previousSibling');

      return $this->createSubCrawler($preceding);
  }
  /**
   * Returns the parents nodes of the current selection.
   *
   * Walks up from the first node of the list and collects every ancestor
   * that is an element node, closest ancestor first.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function parents()
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $ancestors = [];
      for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
          if (\XML_ELEMENT_NODE === $current->nodeType) {
              $ancestors[] = $current;
          }
      }

      return $this->createSubCrawler($ancestors);
  }
  /**
   * Returns the children nodes of the current selection.
   *
   * Without a selector, the children of the first node of the list are
   * collected through sibling(). With a selector, it is converted to an
   * XPath expression on the "child::" axis and used to filter the nodes.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   * @throws \RuntimeException         If the CssSelector Component is not available and $selector is provided
   */
  public function children(string $selector = null)
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      if (null === $selector) {
          $firstChild = $this->getNode(0)->firstChild;

          return $this->createSubCrawler($firstChild ? $this->sibling($firstChild) : []);
      }

      $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'child::');

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Returns the value of an attribute of the first node of the list.
   *
   * @return string|null The attribute value or null if the attribute does not exist
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function attr(string $attribute)
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);
      if (!$element->hasAttribute($attribute)) {
          return null;
      }

      return $element->getAttribute($attribute);
  }
  /**
   * Returns the name of the first node of the list.
   *
   * @return string The node name
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nodeName()
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);

      return $firstNode->nodeName;
  }
  /**
   * Returns the text of the first node of the list.
   *
   * Whitespaces are normalized by default; pass false as the second
   * argument to get the node value untouched.
   *
   * @param string|null $default             When not null: the value to return when the current node is empty
   * @param bool        $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
   *
   * @return string The node value
   *
   * @throws \InvalidArgumentException When current node is empty and no default is given
   */
  public function text(string $default = null, bool $normalizeWhitespace = true)
  {
      if (empty($this->nodes)) {
          if (null === $default) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $default;
      }

      $raw = $this->getNode(0)->nodeValue;
      if (!$normalizeWhitespace) {
          return $raw;
      }

      // Runs of whitespace and any whitespace character other than a plain
      // space are each replaced by one space before trimming
      return trim(preg_replace('/(?:\s{2,}+|[^\S ])/', ' ', $raw));
  }
  /**
   * Returns the inner HTML of the first node of the list.
   *
   * The children of the node are serialized one by one and concatenated.
   * Serialization is done by the owner document, or by the HTML5 parser when
   * it is available and the document starts with an HTML5 doctype.
   *
   * @param string|null $default When not null: the value to return when the current node is empty
   *
   * @return string The node html
   *
   * @throws \InvalidArgumentException When current node is empty and no default is given
   */
  public function html(string $default = null)
  {
      if (empty($this->nodes)) {
          if (null === $default) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $default;
      }

      $element = $this->getNode(0);
      $serializer = $element->ownerDocument;

      if (null !== $this->html5Parser && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0])) {
          $serializer = $this->html5Parser;
      }

      $fragments = [];
      foreach ($element->childNodes as $childNode) {
          $fragments[] = $serializer->saveHTML($childNode);
      }

      return implode('', $fragments);
  }
  /**
   * Returns the first node of the list serialized as HTML, the node itself included.
   *
   * Serialization is done by the owner document, or by the HTML5 parser when
   * it is available and the document starts with an HTML5 doctype.
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function outerHtml(): string
  {
      if (!\count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);
      $document = $element->ownerDocument;

      $useHtml5Parser = null !== $this->html5Parser && '<!DOCTYPE html>' === $document->saveXML($document->childNodes[0]);
      if ($useHtml5Parser) {
          return $this->html5Parser->saveHTML($element);
      }

      return $document->saveHTML($element);
  }
  /**
   * Evaluates an XPath expression.
   *
   * The expression is evaluated once per node of the list. Since an XPath
   * expression might evaluate to either a simple type or a \DOMNodeList,
   * this method will return either an array of simple types or a new Crawler
   * instance; the type of the first result decides which one.
   *
   * @return array|Crawler An array of evaluation results or a new Crawler instance
   *
   * @throws \LogicException When the crawler is not bound to a document yet
   */
  public function evaluate(string $xpath)
  {
      if (null === $this->document) {
          throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
      }

      $results = [];
      $xpathEngine = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

      foreach ($this->nodes as $contextNode) {
          $results[] = $xpathEngine->evaluate($xpath, $contextNode);
      }

      $yieldsNodeLists = isset($results[0]) && $results[0] instanceof \DOMNodeList;

      return $yieldsNodeLists ? $this->createSubCrawler($results) : $results;
  }
  /**
   * Extracts information from the list of nodes.
   *
   * You can extract attributes or/and the node value (_text) or/and the
   * node name (_name).
   *
   * Example:
   *
   *     $crawler->filter('h1 a')->extract(['_text', 'href']);
   *
   * When exactly one attribute is requested, each entry of the result is the
   * bare value; otherwise each entry is an array of values, in the order of
   * $attributes.
   *
   * @return array An array of extracted values
   */
  public function extract(array $attributes)
  {
      $singleValue = 1 === \count($attributes);

      $rows = [];
      foreach ($this->nodes as $domNode) {
          $row = [];
          foreach ($attributes as $name) {
              if ('_text' === $name) {
                  $value = $domNode->nodeValue;
              } elseif ('_name' === $name) {
                  $value = $domNode->nodeName;
              } else {
                  $value = $domNode->getAttribute($name);
              }

              $row[] = $value;
          }

          $rows[] = $singleValue ? $row[0] : $row;
      }

      return $rows;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression is evaluated in the context of the crawler, which
   * is considered as a fake parent of the elements inside it.
   * This means that a child selector "div" or "./div" will match only
   * the div elements of the current crawler, not their children.
   *
   * The expression is first passed through relativize(); when nothing is
   * left of it, the sub-crawler is created from null.
   *
   * @return static
   */
  public function filterXPath(string $xpath)
  {
      $relative = $this->relativize($xpath);

      // If we dropped all expressions in the XPath while preparing it, there would be no match
      return '' === $relative
          ? $this->createSubCrawler(null)
          : $this->filterRelativeXPath($relative);
  }
  /**
   * Filters the list of nodes with a CSS selector.
   *
   * This method only works if you have installed the CssSelector Symfony Component.
   *
   * @return static
   *
   * @throws \RuntimeException if the CssSelector Component is not available
   */
  public function filter(string $selector)
  {
      // The CssSelector already prefixes the selector with descendant-or-self::
      $xpath = $this->createCssSelectorConverter()->toXPath($selector);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects links by name or alt value for clickable images.
   *
   * An <a> element is selected when $value appears as a whole,
   * space-delimited run in its normalized text or in the normalized alt
   * attribute of one of its <img> children.
   *
   * @return static
   */
  public function selectLink(string $value)
  {
      // Pad with spaces so that the value only matches on whole-word boundaries
      $needle = static::xpathLiteral(' '.$value.' ');

      $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]', $needle);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects images whose normalized alt attribute contains the given value.
   *
   * @return static A new instance of Crawler with the filtered list of nodes
   */
  public function selectImage(string $value)
  {
      return $this->filterRelativeXPath(
          sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', static::xpathLiteral($value))
      );
  }
  /**
   * Selects a button by name or alt value for images.
   *
   * Matches <input> elements of type submit/button (by value) or image (by
   * alt), and <button> elements (by text); both also match when their id or
   * name attribute equals $value. The type attribute is compared lowercased.
   *
   * @return static
   */
  public function selectButton(string $value)
  {
      $lowercasedType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
      // Padded literal for whole-word matching on labels, exact literal for id/name
      $paddedLiteral = static::xpathLiteral(' '.$value.' ');
      $exactLiteral = static::xpathLiteral($value);

      $xpath = sprintf('descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or @id=%3$s or @name=%3$s]', $lowercasedType, $paddedLiteral, $exactLiteral);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Returns a Link object for the first node in the list.
   *
   * The link is built with the crawler's base href and the given method.
   *
   * @return Link A Link instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function link(string $method = 'get')
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);
      if ($element instanceof \DOMElement) {
          return new Link($element, $this->baseHref, $method);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
  }
  /**
   * Returns an array of Link objects for the nodes in the list.
   *
   * Every link is built with the crawler's base href and the 'get' method.
   *
   * @return Link[] An array of Link instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function links()
  {
      $result = [];
      foreach ($this->nodes as $element) {
          if ($element instanceof \DOMElement) {
              $result[] = new Link($element, $this->baseHref, 'get');

              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
      }

      return $result;
  }
  /**
   * Returns an Image object for the first node in the list.
   *
   * The image is built with the crawler's base href.
   *
   * @return Image An Image instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function image()
  {
      if (!\count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);
      if ($element instanceof \DOMElement) {
          return new Image($element, $this->baseHref);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
  }
  /**
   * Returns an array of Image objects for the nodes in the list.
   *
   * Every image is built with the crawler's base href.
   *
   * @return Image[] An array of Image instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function images()
  {
      $result = [];
      foreach ($this as $element) {
          if ($element instanceof \DOMElement) {
              $result[] = new Image($element, $this->baseHref);

              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
      }

      return $result;
  }
  /**
   * Returns a Form object for the first node in the list.
   *
   * The form is built with the crawler's URI and base href; when $values is
   * given, it is applied to the form through setValues().
   *
   * @param array|null  $values Values to set on the form, if any
   * @param string|null $method The method handed to the Form constructor
   *
   * @return Form A Form instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function form(array $values = null, string $method = null)
  {
      if (empty($this->nodes)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);
      if (!$element instanceof \DOMElement) {
          throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
      }

      $result = new Form($element, $this->uri, $method, $this->baseHref);
      if (null === $values) {
          return $result;
      }

      $result->setValues($values);

      return $result;
  }
  /**
   * Replaces the default namespace prefix to be used with XPath and CSS expressions.
   *
   * @param string $prefix The new default prefix (initially 'default')
   */
  public function setDefaultNamespacePrefix(string $prefix)
  {
      $this->defaultNamespacePrefix = $prefix;
  }
  /**
   * Records a namespace in the map of manually registered namespaces.
   *
   * Registering a prefix again overwrites the namespace stored for it.
   *
   * @param string $prefix    The prefix, used as key of the map
   * @param string $namespace The namespace stored for that prefix
   */
  public function registerNamespace(string $prefix, string $namespace)
  {
      $this->namespaces[$prefix] = $namespace;
  }
  /**
   * Converts string for XPath expressions.
   *
   * XPath string literals have no escape sequence, so the string is wrapped
   * in whichever quote character it does not contain; when it contains both
   * quotes (") and apostrophes ('), it is split on the apostrophes and
   * rebuilt with concat().
   *
   * Examples:
   *
   *     echo Crawler::xpathLiteral('foo " bar');
   *     //prints 'foo " bar'
   *
   *     echo Crawler::xpathLiteral("foo ' bar");
   *     //prints "foo ' bar"
   *
   *     echo Crawler::xpathLiteral('a\'b"c');
   *     //prints concat('a', "'", 'b"c')
   *
   * @return string Converted string
   */
  public static function xpathLiteral(string $s)
  {
      $hasApostrophe = false !== strpos($s, "'");
      if (!$hasApostrophe) {
          return "'".$s."'";
      }

      $hasDoubleQuote = false !== strpos($s, '"');
      if (!$hasDoubleQuote) {
          return '"'.$s.'"';
      }

      // Each apostrophe-free segment is single-quoted; the apostrophes
      // themselves are re-inserted as double-quoted literals between them
      $quotedSegments = [];
      foreach (explode("'", $s) as $segment) {
          $quotedSegments[] = "'".$segment."'";
      }

      return 'concat('.implode(', "\'", ', $quotedSegments).')';
  }
  /**
   * Evaluates an XPath expression against every node of the crawler and
   * gathers the results in a new sub-crawler.
   *
   * The expression is used as-is: it must already have been rewritten so that
   * it makes sense with each node as its context.
   *
   * @return static
   */
  private function filterRelativeXPath(string $xpath): object
  {
      $namespacePrefixes = $this->findNamespacePrefixes($xpath);
      $result = $this->createSubCrawler(null);

      foreach ($this->nodes as $contextNode) {
          // One evaluator per node, bound to the document that node belongs to.
          $evaluator = $this->createDOMXPath($contextNode->ownerDocument, $namespacePrefixes);
          $matches = $evaluator->query($xpath, $contextNode);
          $result->add($matches);
      }

      return $result;
  }
  /**
   * Rewrites an XPath expression so that it is relative to the current context.
   *
   * The expression is split on its top-level union operators ("|"); string
   * literals and predicates ("[...]") are skipped while looking for them.
   * Each branch is then rewritten on its own and the branches are joined
   * again with " | ". Branches that cannot match in the crawler are replaced
   * by an expression that never matches.
   *
   * When the expression is found to be invalid (unterminated string literal,
   * unbalanced brackets) it is returned unchanged.
   */
  private function relativize(string $xpath): string
  {
      $whitespace = " \t\n\r\0\x0B";
      // Stand-in for branches which cannot match in the crawler.
      $neverMatching = 'a[name() = "b"]';

      $rewritten = [];
      $length = \strlen($xpath);
      $bracketDepth = 0;
      $start = strspn($xpath, $whitespace);

      for ($pos = $start; $pos <= $length; ++$pos) {
          // Jump to the next quote, bracket or union operator (or to the end of the string).
          $pos += strcspn($xpath, '"\'[]|', $pos);

          if ($pos < $length) {
              $char = $xpath[$pos];

              if ('"' === $char || "'" === $char) {
                  // Move to the closing quote so that the literal content is not interpreted.
                  $pos = strpos($xpath, $char, $pos + 1);
                  if (false === $pos) {
                      return $xpath; // The XPath expression is invalid
                  }

                  continue;
              }

              if ('[' === $char) {
                  ++$bracketDepth;

                  continue;
              }

              if (']' === $char) {
                  --$bracketDepth;

                  continue;
              }
          }

          // Inside a predicate, neither "|" nor the end of the string closes a branch.
          if (0 !== $bracketDepth) {
              continue;
          }

          // If the union is inside some braces, the opening braces are preserved
          // and the rewrite is applied only to what follows them.
          $parenthesis = '';
          if ($start < $length && '(' === $xpath[$start]) {
              $prefixLength = 1 + strspn($xpath, '('.$whitespace, $start + 1);
              $parenthesis = substr($xpath, $start, $prefixLength);
              $start += $prefixLength;
          }

          $branch = rtrim(substr($xpath, $start, $pos - $start));

          if (0 === strncmp($branch, 'self::*/', 8)) {
              $branch = './'.substr($branch, 8);
          }

          // add prefix before absolute element selector
          if ('' === $branch) {
              $branch = $neverMatching;
          } elseif (0 === strncmp($branch, '//', 2)) {
              $branch = 'descendant-or-self::'.substr($branch, 2);
          } elseif (0 === strncmp($branch, './/', 3)) {
              $branch = 'descendant-or-self::'.substr($branch, 3);
          } elseif (0 === strncmp($branch, './', 2)) {
              $branch = 'self::'.substr($branch, 2);
          } elseif (0 === strncmp($branch, 'child::', 7)) {
              $branch = 'self::'.substr($branch, 7);
          } elseif ('/' === $branch[0] || '.' === $branch[0] || 0 === strncmp($branch, 'self::', 6)) {
              $branch = $neverMatching;
          } elseif (0 === strncmp($branch, 'descendant::', 12)) {
              $branch = 'descendant-or-self::'.substr($branch, 12);
          } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $branch)) {
              // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
              $branch = $neverMatching;
          } elseif (0 !== strncmp($branch, 'descendant-or-self::', 20)) {
              $branch = 'self::'.$branch;
          }

          $rewritten[] = $parenthesis.$branch;

          if ($pos === $length) {
              return implode(' | ', $rewritten);
          }

          // Skip the whitespace after the union operator; the next branch starts right behind it.
          $pos += strspn($xpath, $whitespace, $pos + 1);
          $start = $pos + 1;
      }

      return $xpath; // The XPath expression is invalid
  }
  /**
   * Returns the node stored at the given position of the node list.
   *
   * @return \DOMNode|null The node, or null when nothing is stored at that position
   */
  public function getNode(int $position)
  {
      if (isset($this->nodes[$position])) {
          return $this->nodes[$position];
      }

      return null;
  }
  /**
   * Returns the number of nodes held by the crawler.
   *
   * No native return type is declared so that overriding methods without one
   * keep working. Since PHP 8.1 an implementation of \Countable::count()
   * without a compatible return type triggers a deprecation unless it carries
   * the attribute below.
   * NOTE(review): the class header is not visible here; this presumes the
   * class implements \Countable. The attribute has no effect otherwise and is
   * read as a plain comment by PHP versions older than 8.0.
   *
   * @return int
   */
  #[\ReturnTypeWillChange]
  public function count()
  {
      return \count($this->nodes);
  }
  /**
   * Returns an iterator over the nodes held by the crawler.
   *
   * No native return type is declared so that overriding methods without one
   * keep working. Since PHP 8.1 an implementation of
   * \IteratorAggregate::getIterator() without a compatible return type
   * triggers a deprecation unless it carries the attribute below.
   * NOTE(review): the class header is not visible here; this presumes the
   * class implements \IteratorAggregate. The attribute has no effect otherwise
   * and is read as a plain comment by PHP versions older than 8.0.
   *
   * @return \ArrayIterator|\DOMNode[]
   */
  #[\ReturnTypeWillChange]
  public function getIterator()
  {
      return new \ArrayIterator($this->nodes);
  }
  /**
   * Walks a chain of siblings and collects the element nodes found on the way.
   *
   * The walk starts on $node itself and follows the property named by
   * $siblingDir until it yields a falsy value. The first node of the crawler
   * is never part of the result.
   *
   * @param \DOMElement $node       The node the walk starts from
   * @param string      $siblingDir Name of the node property to follow
   *
   * @return array
   */
  protected function sibling($node, string $siblingDir = 'nextSibling')
  {
      $reference = $this->getNode(0);
      $collected = [];

      do {
          // Skip the crawler's own first node and anything that is not an element.
          if ($node === $reference || \XML_ELEMENT_NODE !== $node->nodeType) {
              continue;
          }

          $collected[] = $node;
      } while ($node = $node->$siblingDir);

      return $collected;
  }
  /**
   * Parses HTML content with the HTML5 parser held by the crawler.
   *
   * The content is first passed through convertToHtmlEntities().
   */
  private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
  {
      $encoded = $this->convertToHtmlEntities($htmlContent, $charset);

      return $this->html5Parser->parse($encoded, [], $charset);
  }
  /**
   * Parses HTML content with \DOMDocument::loadHTML().
   *
   * libxml errors are collected internally while parsing, and the previous
   * libxml settings are put back afterwards. Content that is empty once
   * trimmed is not loaded at all: an empty document is returned.
   */
  private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
  {
      $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

      $previousErrorMode = libxml_use_internal_errors(true);

      // The entity loader is only switched off for libxml versions older than 2.9.0.
      $guardEntityLoader = \LIBXML_VERSION < 20900;
      if ($guardEntityLoader) {
          $previousLoaderState = libxml_disable_entity_loader(true);
      }

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      $isBlank = '' === trim($htmlContent);
      if (!$isBlank) {
          @$document->loadHTML($htmlContent);
      }

      libxml_use_internal_errors($previousErrorMode);
      if ($guardEntityLoader) {
          libxml_disable_entity_loader($previousLoaderState);
      }

      return $document;
  }
  /**
   * Converts non-ASCII characters to HTML numeric character references to ensure valid parsing.
   *
   * The content is first interpreted in $charset. If that fails, it is
   * converted to UTF-8 with iconv() and encoded from there. If that fails as
   * well, the content is returned as far as it could be converted.
   *
   * mb_encode_numericentity() is used instead of the "HTML-ENTITIES" mbstring
   * encoding: the latter is deprecated since PHP 8.2, and the error handler
   * below turns that deprecation into an exception, so non-ASCII characters
   * would be returned unencoded.
   *
   * @param string $htmlContent The markup to convert
   * @param string $charset     The character set $htmlContent is encoded in
   *
   * @return string The content with every character from U+0080 upwards written as a numeric reference
   */
  private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
  {
      // Conversion map: [first code point, last code point, offset, mask].
      $nonAscii = [0x80, 0x10FFFF, 0, 0x1FFFFF];

      // Any warning or notice raised while converting is treated as a failure.
      set_error_handler(function () { throw new \Exception(); });

      try {
          return mb_encode_numericentity($htmlContent, $nonAscii, $charset);
      } catch (\Exception | \ValueError $e) {
          try {
              $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
              $htmlContent = mb_encode_numericentity($htmlContent, $nonAscii, 'UTF-8');
          } catch (\Exception | \ValueError $e) {
              // Deliberate best effort: keep whatever could be converted so far.
          }

          return $htmlContent;
      } finally {
          restore_error_handler();
      }
  }
  /**
   * Builds a \DOMXPath for a document and registers on it every given prefix
   * for which a namespace can be discovered.
   *
   * Prefixes without a discoverable namespace are left unregistered.
   *
   * @throws \InvalidArgumentException Kept from the original documentation; no statement visible in this method raises it
   */
  private function createDOMXPath(\DOMDocument $document, array $prefixes = []): \DOMXPath
  {
      $evaluator = new \DOMXPath($document);

      foreach ($prefixes as $prefix) {
          $uri = $this->discoverNamespace($evaluator, $prefix);
          if (null === $uri) {
              continue;
          }

          $evaluator->registerNamespace($prefix, $uri);
      }

      return $evaluator;
  }
  /**
   * Resolves the namespace URI bound to a prefix.
   *
   * Namespaces recorded on the crawler win. Otherwise the document behind
   * $domxpath is searched for a namespace declaration with that name; the
   * default namespace prefix is searched under an empty name.
   *
   * @return string|null The namespace URI, or null when none is found
   *
   * @throws \InvalidArgumentException Kept from the original documentation; no statement visible in this method raises it
   */
  private function discoverNamespace(\DOMXPath $domxpath, string $prefix): ?string
  {
      if (isset($this->namespaces[$prefix])) {
          return $this->namespaces[$prefix];
      }

      $lookupName = $prefix === $this->defaultNamespacePrefix ? '' : $prefix;

      // ask for one namespace, otherwise we'd get a collection with an item for each node
      $query = sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName);
      $found = $domxpath->query($query)->item(0);

      return null === $found ? null : $found->nodeValue;
  }
  /**
   * Lists the distinct namespace prefixes used in an XPath expression.
   *
   * A prefix is a name directly followed by a single colon; names followed
   * by "::" are not reported.
   *
   * @return string[] The prefixes, without duplicates (keys are those left by array_unique())
   */
  private function findNamespacePrefixes(string $xpath): array
  {
      $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);

      return $found ? array_unique($matches['prefix']) : [];
  }
  /**
   * Creates a crawler of the same class for a subset of nodes.
   *
   * The new crawler receives the URI and base href of this one, and shares
   * its document, HTML flag, registered namespaces and HTML5 parser.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $nodes
   *
   * @return static
   */
  private function createSubCrawler($nodes): object
  {
      $child = new static($nodes, $this->uri, $this->baseHref);

      // Carry the state of the parent over to the child.
      $child->html5Parser = $this->html5Parser;
      $child->namespaces = $this->namespaces;
      $child->document = $this->document;
      $child->isHtml = $this->isHtml;

      return $child;
  }
  /**
   * Creates the converter used to filter with CSS selectors.
   *
   * The converter is created with the HTML flag of the crawler.
   *
   * @throws \LogicException If the CssSelector Component is not available
   */
  private function createCssSelectorConverter(): CssSelectorConverter
  {
      if (class_exists(CssSelectorConverter::class)) {
          return new CssSelectorConverter($this->isHtml);
      }

      throw new \LogicException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.');
  }
  /**
   * Turns an HTML string into a DOMDocument.
   *
   * The HTML5 parser is used when canParseHtml5String() accepts the content;
   * the libxml based parser is used otherwise.
   */
  private function parseHtmlString(string $content, string $charset): \DOMDocument
  {
      return $this->canParseHtml5String($content)
          ? $this->parseHtml5($content, $charset)
          : $this->parseXhtml($content, $charset);
  }
  /**
   * Tells whether the content should be handed to the HTML5 parser.
   *
   * This requires an HTML5 parser to be set, and a "<!doctype html>"
   * declaration (matched case-insensitively) that is either at the very start
   * of the content or preceded only by a valid HTML5 heading.
   */
  private function canParseHtml5String(string $content): bool
  {
      if (null === $this->html5Parser) {
          return false;
      }

      $doctypeOffset = stripos($content, '<!doctype html>');
      if (false === $doctypeOffset) {
          return false;
      }

      // Everything found in front of the doctype declaration.
      $preamble = substr($content, 0, $doctypeOffset);
      if ('' === $preamble) {
          return true;
      }

      return $this->isValidHtml5Heading($preamble);
  }
  /**
   * Tells whether a string is acceptable in front of an HTML5 doctype.
   *
   * Accepted: an optional byte order mark (U+FEFF), whitespace, and any
   * number of comments that contain no ">" character.
   */
  private function isValidHtml5Heading(string $heading): bool
  {
      $matched = preg_match('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u', $heading);

      return 1 === $matched;
  }
  1082. }