diff options
author | Matthew Lemon <matt@matthewlemon.com> | 2022-09-13 19:59:24 +0100 |
---|---|---|
committer | Matthew Lemon <matt@matthewlemon.com> | 2022-09-13 19:59:24 +0100 |
commit | bcac8134ea12369d6cf8ba7068ec78fd49a47223 (patch) | |
tree | bfb56b1260df902ad532b4e2d409e3513719c097 /writing_tools/random_line.pl | |
parent | b3ce73c143be4f3a69815c370c61339ecb94f47d (diff) |
better processing of urls
Diffstat (limited to 'writing_tools/random_line.pl')
-rw-r--r-- | writing_tools/random_line.pl | 35 |
1 files changed, 27 insertions, 8 deletions
```diff
diff --git a/writing_tools/random_line.pl b/writing_tools/random_line.pl
index a7f2eb8..fe19ab7 100644
--- a/writing_tools/random_line.pl
+++ b/writing_tools/random_line.pl
@@ -37,6 +37,13 @@ foreach my $f (glob("$dir/*.md")) {
     }
     close $fh or die "can't read close file '$f': $OS_ERROR";
 }
+
+sub striptime {
+    my $url = shift;
+    $url =~ s/\?t=\d*//;
+    return $url;
+}
+
 #
 # Let's interact with the World Wide Web!
 my $ua = LWP::UserAgent->new;
@@ -47,14 +54,13 @@ foreach my $line (@targetlines) {
     if ($line =~ m/$RE{URI}{HTTP}{-scheme => qr<https?>}{-keep}/) {
         my$t = $1;
         $t =~ s/\.$//; # remove the fullstop if it has one at the end
-        # print "Saving: $t\n";
-        # push @urls => $t
-        my $req = HTTP::Request->new(GET => $t);
-        $req->header(Accept => "text/html");
-        my $res = $ua->request($req);
-        my $p = HTML::HeadParser->new;
-        $p->parse($res->content) and print "not finished";
-        print $p->header('Title'), "\n";
+        push @urls => striptime($t)
+        # my $req = HTTP::Request->new(GET => $t);
+        # $req->header(Accept => "text/html");
+        # my $res = $ua->request($req);
+        # my $p = HTML::HeadParser->new;
+        # $p->parse($res->content) and print "not finished";
+        # print $p->header('Title'), "\n";
         # my $root = HTML::TreeBuilder->new_from_content($res->content);
         # my $title = $root->look_down('_tag' => 'title');
         # my $value = $title->attr('value');
@@ -62,6 +68,19 @@ foreach my $line (@targetlines) {
 }
 
 
+foreach my $url (@urls) {
+    print "URL: $url\n";
+    my $req = HTTP::Request->new(GET => $url);
+    $req->header(Accept => "text/html");
+    my $res = $ua->request($req);
+    my $p = HTML::HeadParser->new;
+    $p->parse($res->content) and print "not finished";
+    print "TITLE:", $p->header('Title'), "\n";
+    print "\n";
+}
+
+
+
 # foreach my $url (@urls) {
 # print $url;
 # my $req = HTTP::Request->new(GET => $url);
```