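# Source: wiki page "Utilisateur:Svgalbertian/TSX/Script"
#
# sedar_spider.pl
#
# (The wiki section-edit link label "modifier" appeared here in the page export.)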
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtractor;
use HTML::Entities;
use Data::Dumper;
use Encode;
use utf8;

# Directory search variables
# $dirURL is a template; the literal token stored in $replaceme is swapped
# for each entry of @alpha to obtain one directory page per entry.
my $dirURL = "https://www.sedar.com/issuers/company_issuers_\%char\%_en.htm";
my $replaceme = '%char%';
my @alpha = ("a" .. "z");
push(@alpha, "nc"); # add NC for non-alphabetic page

# Company listing variables
# $comURL is a template with two tokens: $langSub (language code) and
# $issuerNoSub (issuer number).
my $comURL = "https://www.sedar.com/DisplayProfile.do?lang=\%lang\%&issuerType=03&issuerNo=\%issuerNo\%";
my $issuerNoSub = '%issuerNo%';
my $langSub = '%lang%';
my @langArray = ("EN", "FR");

# Misc variables
# Company records keyed by issuer number; each value is a hash of fields
# (name_en, name_fr, symbol, ...). The rest of the script reads and writes
# %reftable, so the declaration uses that exact name.
my %reftable;

# Output files
# Three-argument open keeps the mode separate from the file name, and the
# error message reports which file failed and why ($!).
# The bareword handle names are kept because the rest of the script prints
# to CVSFILE and WIKITXT.
open(CVSFILE, '>', 'sedar_pipe.txt')
    or die("Cannot open sedar_pipe.txt for writing: $!");
open(WIKITXT, '>', 'wiki.txt')
    or die("Cannot open wiki.txt for writing: $!");

# Header row of the pipe-delimited file and opening of the wiki table.
print CVSFILE "symbol|name_en|name_fr|industry|website|address|formed\n";
print WIKITXT "{| class=\"wikitable sortable\"\n!Symbole !! Nom_en !! Nom_fr || Lien\n|-\n";

# Cycle through all the directory pages and collect, for every company
# profile link found, the issuer number and the link text (English name).
foreach my $letter (@alpha) {
    # Create the URL of the directory page by filling in the template token.
    (my $currenturl = $dirURL) =~ s/\Q$replaceme\E/$letter/g;

    # Grab the directory page
    print "Grabbing page:  ". $currenturl . "\n";
    my $contents = getPage($currenturl);

    # Extract all links.
    # NOTE(review): the third constructor argument (1) is presumably the
    # "strip tags from link text" flag -- confirm against the
    # HTML::LinkExtractor documentation.
    my $page_parser = HTML::LinkExtractor->new(undef, undef, 1);
    $page_parser->parse(\$contents);

    # Add matching company links to hash table
    for my $link (@{ $page_parser->links }) {
        my $href = $$link{href};

        # Only company profile links are of interest.
        next unless defined $href && $href =~ m/DisplayProfile\.do/;

        # A profile link without an 8-digit issuer number cannot be looked
        # up later; skip it instead of creating a record under an empty key.
        my ($issuerNo) = $href =~ m/issuerNo=(\d{8})/;
        next unless defined $issuerNo;

        # NOTE(review): getPage's own comment says it already returns UTF-8
        # encoded text, so encoding again here may double-encode non-ASCII
        # names -- confirm before changing, since name_fr is built the same
        # way and the two are compared later.
        $reftable{$issuerNo}{'name_en'} = encode('utf8', $$link{_TEXT});
    }
}

print "Check company page...\n";

# Visit every collected company in case-insensitive English-name order.
# The sorted key list is built before the loop starts, so deleting entries
# from %reftable inside the loop does not disturb the iteration.
foreach my $key (sort { uc($reftable{$a}{'name_en'}) cmp uc($reftable{$b}{'name_en'}) } keys %reftable) {

    # Build the english URL
    (my $url_en = $comURL) =~ s/\Q$langSub\E/EN/g;
    $url_en =~ s/\Q$issuerNoSub\E/$key/g;

    $reftable{$key}{'link_en'} = $url_en;

    # Grab the page
    my $contents = getPage($url_en);

    # Check if the company is active
    my $jurisdictions = ParseProfile($contents, "Reporting Jurisdictions");
    if (defined $jurisdictions && $jurisdictions eq "Ceased Reporting") {
        delete $reftable{$key};
        next;
    }

    # Check if the company is on the TSX ("TSX" at the end of the field or
    # directly followed by a comma).
    my $exchange = ParseProfile($contents, "Stock Exchange");
    if (!defined $exchange || $exchange !~ m/TSX($|,)/) {
        delete $reftable{$key};
        next;
    }

    # Shorthand for this company's record (same hash, not a copy).
    my $record = $reftable{$key};

    # Grab the rest of the data
    $record->{'symbol'}   = ParseProfile($contents, "Stock Symbol");
    $record->{'address'}  = ParseProfile($contents, "Head Office Address");
    $record->{'formed'}   = ParseProfile($contents, "Date of Formation");
    $record->{'industry'} = ParseProfile($contents, "Industry Classification");

    # Symbol adjust: symbols longer than three characters get a "." after
    # the third character.
    if (defined $record->{'symbol'} && length($record->{'symbol'}) > 3) {
        $record->{'symbol'} = substr($record->{'symbol'}, 0, 3) .
            "." . substr($record->{'symbol'}, 3);
    }

    # Guess website from email address: take the domain after the "@".
    # The capture stops at whitespace so trailing blanks or line endings do
    # not end up in the URL, and an empty domain yields no website at all.
    my $email = ParseProfile($contents, "Business e-mail address");
    if (defined $email && $email =~ m/@([^\s@]+)/) {
        $record->{'website'} = "http://www." . $1;
    } else {
        $record->{'website'} = "";
    }

    # Build the french URL
    (my $url_fr = $comURL) =~ s/\Q$langSub\E/FR/g;
    $url_fr =~ s/\Q$issuerNoSub\E/$key/g;

    # Grab the french profile
    my $contents_fr = getPage($url_fr);

    # Save the french name (first <STRONG> element of the page).
    # The match result is checked: after a failed match $1 would still hold
    # whatever an earlier match left behind and be stored as the name.
    # NOTE(review): getPage's own comment says it already returns UTF-8
    # encoded text, so this encode() may double-encode raw non-ASCII
    # characters -- confirm together with the name_en handling.
    if (defined $contents_fr && $contents_fr =~ m/<STRONG>(.*?)<\/STRONG>/) {
        my $raw_name = $1;
        $record->{'name_fr'} = encode('utf8', HTML::Entities::decode($raw_name));
    } else {
        warn "No French name found for issuer $key ($url_fr)\n";
        $record->{'name_fr'} = '';
    }

    # Cleanup names: collapse whitespace runs and drop a trailing
    # "(formerly ...)" style remark.
    $record->{'name_en'} =~ s/\s+/ /g;
    $record->{'name_en'} =~ s/ \((formerly).*//gi;
    $record->{'name_fr'} =~ s/\s+/ /g;
    $record->{'name_fr'} =~ s/ \((formerly|anciennement|auparavant).*//gi;

    #Print status to screen
    print $record->{'symbol'} . "\t" . $record->{'name_en'} . "\n";

    #Print pipe delimited file
    print CVSFILE $record->{'symbol'} .
        "|" . $record->{'name_en'} .
        "|" . $record->{'name_fr'} .
        "|" . $record->{'industry'} .
        "|" . $record->{'website'} .
        "|" . $record->{'address'} .
        "|" . $record->{'formed'} . "\n";

    #Print wikifile: only companies with a French name that differs from
    # the English one get a table row.
    if ($record->{'name_fr'} ne '' && $record->{'name_fr'} ne $record->{'name_en'}) {
        print "\t{fr} " . $record->{'name_fr'} . "\n";
        print WIKITXT "|" . $record->{'symbol'} .
            "||" . $record->{'name_en'} .
            "||" . $record->{'name_fr'} .
            "|| [" . $record->{'link_en'} . "] \n|-\n";
    }
}

# Close the files
# Terminate the wiki table first, then close both outputs. close() is
# checked because buffered write errors (e.g. a full disk) only surface
# when the handle is flushed at close time.
print WIKITXT "|}";
close(WIKITXT) or die("Cannot close wiki.txt: $!");
close(CVSFILE) or die("Cannot close sedar_pipe.txt: $!");

use strict;
# ParseProfile($contents, $expression)
#
# Extracts one labelled value from a profile page.
#
#   $contents    page markup to search
#   $expression  label text (without the trailing colon); it is interpolated
#                into the pattern as-is, so it is treated as a regular
#                expression, and it is matched case-insensitively
#
# Returns the text that follows the first ">" in the table cell after
# "<label>:</td>", up to the end of that line. If the cell contains no ">"
# the cell text itself is returned. Returns the empty string when the label
# is not present (or an argument is undefined).
sub ParseProfile {
    my ($contents, $expression) = @_;
    return '' unless defined $contents && defined $expression;

    # Everything between the label cell's closing tag and the next </td>.
    # The captures are taken from the match's return list rather than from
    # $1, so a failed match cannot hand back a stale capture left over from
    # an earlier, unrelated match.
    my ($cell) = $contents =~ m{${expression}:</td>(.*?)</td>}is;
    return '' unless defined $cell;

    # Drop the leading markup: keep what follows the first ">" up to the end
    # of that line (/m lets $ match before a newline).
    my ($value) = $cell =~ m/>(.*?)$/sm;

    return defined $value ? $value : $cell;
}

use strict;
# getPage($currenturl)
#
# Fetches a web page with an HTTP GET request (10 second timeout, browser
# style User-Agent, Accept and Referer headers).
#
#   $currenturl  URL to fetch
#
# Returns the response body re-encoded from ISO-8859-1 to UTF-8. When the
# request does not succeed, a warning with the status line is printed and
# the empty string is returned, so callers never parse an error page as if
# it were the requested document.
sub getPage {
    my ($currenturl) = @_;

    my $browser = LWP::UserAgent->new();
    $browser->timeout(10);
    $browser->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19");

    my $request = HTTP::Request->new(GET => $currenturl);
    $request->header(Accept => "text/html, */*;q=0.1", referer => 'http://google.com');
    my $response = $browser->request($request);

    # Timeouts, connection failures and HTTP error statuses all end up here.
    unless ($response->is_success) {
        warn "getPage: GET $currenturl failed: " . $response->status_line . "\n";
        return '';
    }

    # Convert from ISO-8859 to utf8
    # NOTE(review): the body is always decoded as ISO-8859-1, whatever
    # charset the response declares -- confirm this matches the server.
    return encode("utf8", decode("ISO-8859-1", $response->content()));
}