44
55use Embed \Bag ;
66use Embed \Utils ;
7+ use Embed \Url ;
78
89/**
910 * Generic html provider.
@@ -14,6 +15,7 @@ class Html extends Provider implements ProviderInterface
1415{
1516 protected $ config = [
1617 'maxImages ' => -1 ,
18+ 'externalImages ' => false
1719 ];
1820
1921 /**
@@ -271,28 +273,30 @@ protected static function extractFromMeta(\DOMDocument $html, Bag $bag)
271273 */
272274 protected function extractImages (\DOMElement $ html )
273275 {
274- $ domain = $ this ->request ->getDomain ();
275-
276276 foreach ($ html ->getElementsByTagName ('img ' ) as $ img ) {
277277 if ($ img ->hasAttribute ('src ' )) {
278278 $ src = $ this ->request ->createUrl ($ img ->getAttribute ('src ' ));
279279
280280 //Avoid external images
281- if ($ src -> getContent () === null && $ src-> getDomain () !== $ domain ) {
281+ if (! $ this -> imageIsValid ( $ src) ) {
282282 continue ;
283283 }
284284
285285 $ parent = $ img ->parentNode ;
286286
287+ //The image is in a link
287288 while ($ parent && isset ($ parent ->tagName )) {
288289 if ($ parent ->tagName === 'a ' ) {
290+ //The link is external
289291 if ($ parent ->hasAttribute ('href ' )) {
290292 $ href = $ this ->request ->createUrl ($ parent ->getAttribute ('href ' ));
291293
292- if ($ href -> getDomain () !== $ domain ) {
294+ if (! $ this -> imageIsValid ( $ href ) ) {
293295 continue 2 ;
294296 }
295297 }
298+
299+ //The link has rel=nofollow
296300 if ($ parent ->hasAttribute ('rel ' ) && (string ) $ parent ->getAttribute ('rel ' ) === 'nofollow ' ) {
297301 continue 2 ;
298302 }
@@ -308,6 +312,23 @@ protected function extractImages(\DOMElement $html)
308312 }
309313 }
310314
/**
 * Decide whether an image url is acceptable for this provider.
 *
 * A url is accepted when it carries its own content, when its domain is
 * the same as the domain of the current request, or when the
 * 'externalImages' config value allows it.
 *
 * @param Url $url
 *
 * @return bool
 */
protected function imageIsValid(Url $url)
{
    // The url has embedded content (presumably a base64 data uri)
    if ($url->getContent() !== null) {
        return true;
    }

    // The url belongs to the same domain as the request
    if ($url->getDomain() === $this->request->getDomain()) {
        return true;
    }

    $external = $this->config['externalImages'];

    // A boolean switches every external image on or off
    if (is_bool($external)) {
        return $external;
    }

    // Any other value is handed to the url as a pattern to match against
    return $url->match($external);
}
331+
311332 /**
312333 * Returns the main element of the document.
313334 *
0 commit comments