Sunday, April 24, 2011

The 2011 Scripting Games Advanced Event 6: Use PowerShell to Get Twitter Names from a Web Page

The 2011 Scripting Games Advanced Event 6: Use PowerShell to Get Twitter Names from a Web Page

My personal script:
http://2011sg.poshcode.org/1518
Average Rating: 4.00 by 2 users.
(Download it)


#
#
# 2011 Scripting Games Advanced Event 6: Use PowerShell to Get Twitter Names from a Web Page
#
# by F.Richard 2011-04
#
#

#Requires -Version 2.0

[CmdletBinding()]
Param(
# Url of the web page to scan for Twitter names
[Parameter(Mandatory = $false, Position = 0, ValueFromPipeLine = $true, ValueFromPipelineByPropertyName = $true)]
[ValidateNotNullOrEmpty()]
[String] $url = "http://www.sqlsaturday.com/70/networking.aspx",

# Output file that receives the captured names, one per line
[Parameter(Mandatory = $false, Position = 1, ValueFromPipeLine = $False, ValueFromPipelineByPropertyName = $False)]
[ValidateNotNullOrEmpty()]
[String] $output = "TwitterName.txt",

# Regular expression with a named capture group "name" (required by
# Get-RegexName, which reads $match.Groups["name"]). The original default
# had lost the "<name>" token ("(?[^?/ ]*)"), which is an invalid .NET
# grouping construct and makes [regex]::Matches throw.
[Parameter(Mandatory = $false, Position = 2, ValueFromPipeLine = $False, ValueFromPipelineByPropertyName = $False)]
[ValidateNotNullOrEmpty()]
[String] $regex = "twitter.com/(?<name>[^?/ ]*)\b(?!:)"
)





# Get-WebPage
# Windows PowerShell, Invalid Certificates, and Automated Downloading
#http://blogs.technet.com/b/heyscriptingguy/archive/2010/07/25/windows-powershell-invalid-certificates-and-automated-downloading.aspx
# + some personal modifications
Function Get-WebPage {
<#
.Synopsis
Gets the content at a specified http url
.Parameter url
Url that returns content
.Parameter file
Optional parameter that redirects download of content to a file. If left out
content is returned as a string
.Parameter useragent
to define user agent
.Parameter user
user to permit connection
.Parameter password
password to permit connection
.Parameter domain
user's domain to permit connection
.Parameter proxyurl
proxy Url to permit connection
.Parameter proxyport
proxy port to permit connection
.Parameter force
Forces the acceptance of content from an untrusted source (eg. invalid certificate)
.EXAMPLE
Get-WebPage -url "http://www.mysite.com"
return web page mysite.com
.EXAMPLE
Get-WebPage -url "http://www.mysite.com" -user "usr" -password "pass" -domain "mydom"
return web page mysite.com using particular user
.EXAMPLE
Get-WebPage -url "http://www.mysite.com" -file "contentsite.htm" -proxyurl "proxy.comp.net" -proxyport "8080"
return web page mysite.com to file "contentsite.htm" using proxy proxy.comp.net:8080
.EXAMPLE
Get-WebPage -url "http://www.mysite.com" -useragent "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)"
return web page mysite.com using specific user agent
#>
Param(
[string] $url,
[string] $file = "",
[string] $useragent = "",
[string] $user = "",
[string] $password = "",
[string] $domain = "",
[string] $proxyurl = "",
[string] $proxyport = "",
[switch] $force
)
if ($force) {
    # Accept any server certificate (e.g. self-signed); NOTE(review): this is a
    # process-wide setting, it affects every later HTTPS call in the session.
    [Net.ServicePointManager]::ServerCertificateValidationCallback = {$true}
}

$webclient = New-Object System.Net.WebClient

# Explicit proxy only when one was actually supplied. The original condition
# ('-or !$webClient.Proxy.IsBypassed($url)') also ran this branch with an
# empty $proxyurl/$proxyport, and New-Object System.Net.WebProxy("", "")
# throws; when no proxy is given, WebClient's default proxy is used as-is.
if ($proxyurl -ne "") {
    $proxy = New-Object System.Net.WebProxy($proxyurl, $proxyport)
    #$proxy.Credentials = (Get-Credential).GetNetworkCredential()
    $webclient.Proxy = $proxy
}

if ($useragent -ne "") {
    $webclient.Headers.Add("user-agent", $useragent) # :ex "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)"
}

if ($user -ne "") {
    # Explicit credentials must win: the original set UseDefaultCredentials
    # unconditionally AFTER this assignment, which clobbered them.
    $webclient.Credentials = New-Object System.Net.NetworkCredential($user, $password, $domain)
} else {
    $webclient.UseDefaultCredentials = $true
}

if ($file -eq "") {
    # No target file: return the page body as a string
    return $webclient.DownloadString($url)
} else {
    # Target file given: stream the content to disk, return nothing
    $webclient.DownloadFile($url, $file)
}
}



Function Get-RegexName {
<#
.Synopsis
return array of captured regex group "name"
.Parameter content
content to capture
.Parameter regex
regular expression with a named group "name" to capture
.Parameter regexaction
reserved for future use (currently ignored; kept for interface compatibility)
.Parameter regexoptions
System.Text.RegularExpressions.RegexOptions value(s), default "IgnoreCase"
#>
Param(
[string] $content,
[string] $regex,
[string] $regexaction = "match",
[string] $regexoptions = "IgnoreCase"
)
$arrName = @( )
# Use a distinct local name: '$matches' shadows PowerShell's automatic
# $Matches variable and can confuse callers using -match afterwards.
$matchList = [regex]::Matches($content, $regex, $regexoptions)
foreach ($m in $matchList) {
    $name = $m.Groups["name"].Value.Trim()
    # Skip empty captures so the output contains only real names
    if ($name.Length -gt 0) {
        $arrName += $name
    }
}
return $arrName
}



Function Get-TwitterName {
<#
.Synopsis
write all twitter names found in a web page to a file
.Parameter url
url to examine default: "http://www.sqlsaturday.com/70/networking.aspx"
.Parameter output
output filename default: "TwitterName.txt"
.Parameter regex
regular expression with a named group "name" to capture
default: "twitter.com/(?<name>[^?/ ]*)\b(?!:)"
#>

[CmdletBinding()]
Param(
[Parameter(Mandatory = $false, Position = 0, ValueFromPipeLine = $true, ValueFromPipelineByPropertyName = $true)]
[ValidateNotNullOrEmpty()]
[String] $url = "http://www.sqlsaturday.com/70/networking.aspx",

[Parameter(Mandatory = $false, Position = 1, ValueFromPipeLine = $False, ValueFromPipelineByPropertyName = $False)]
[ValidateNotNullOrEmpty()]
[String] $output = "TwitterName.txt",

# The original default had lost the "<name>" group token ("(?[^?/ ]*)"),
# an invalid .NET grouping construct that makes [regex]::Matches throw,
# and Get-RegexName needs the group to be called "name".
[Parameter(Mandatory = $false, Position = 2, ValueFromPipeLine = $False, ValueFromPipelineByPropertyName = $False)]
[ValidateNotNullOrEmpty()]
[String] $regex = "twitter.com/(?<name>[^?/ ]*)\b(?!:)"
)
# Get web page content (-force: accept untrusted certificates)
$content = Get-WebPage -url $url -force

# get all twitter names
# must use (?!:) trick to do not have line like this one
# a href="http://www.twitter.com/http://twitter.com/sqlvariant
$arrTwitName = Get-RegexName -content $content -regex $regex

# Write one name per line to the output file
$arrTwitName | Out-File $output

}




# main: forward the script-level parameters (url / output file / regex)
# to the worker function; writes the captured names to $output.
Get-TwitterName -url $url -output $output -regex $regex

No comments: