Skip to content

Commit

Permalink
Allow user to specify charset
Browse files Browse the repository at this point in the history
  • Loading branch information
ericchiang committed Dec 14, 2014
1 parent d00d654 commit 9e98bc2
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 21 deletions.
11 changes: 1 addition & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@ If you're on OS X, use [Brew](http://brew.sh/) to install (no Go required).

brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb

For Linux distributions, use the following commands to install pup into a directory on your `PATH`
environment variable. You can set `ARCH` to `linux_386` for 32-bit architectures.

ARCH=linux_amd64
cd /tmp
wget https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_${ARCH}.zip
unzip pup_${ARCH}.zip && rm pup_${ARCH}.zip
sudo mv pup /usr/local/bin
pup --version

## Quick start

```bash
Expand Down Expand Up @@ -353,5 +343,6 @@ output of pup into a more consumable format.
-i --indent number of spaces to use for indent or character
-n --number print number of elements selected
-l --limit restrict number of levels printed
--charset specify the charset for pup to use
--version display version
```
29 changes: 29 additions & 0 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,41 @@ import (
"os"
"strconv"
"strings"

"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

// Package-level settings shared by flag processing, parsing, and display.
var (
// pupIn is the stream the HTML document is read from; it defaults to
// standard input.
pupIn io.ReadCloser = os.Stdin
// pupCharset is the charset given with the --charset flag. The empty
// string makes ParseHTML attempt to guess the charset instead.
pupCharset string = ""
// pupMaxPrintLevel defaults to -1.
// NOTE(review): presumably controlled by -l/--limit, with -1 meaning
// "no limit" — confirm against ProcessFlags and the displayer code.
pupMaxPrintLevel int = -1
// pupPrintColor defaults to false.
// NOTE(review): presumably toggles colored output — confirm.
pupPrintColor bool = false
// pupIndentString is the default indent text.
// NOTE(review): presumably overridden by -i/--indent — confirm.
pupIndentString string = " "
// pupDisplayer is the Displayer in use; it defaults to TreeDisplayer{}.
pupDisplayer Displayer = TreeDisplayer{}
)

// ParseHTML parses the HTML document read from r and returns its root node.
//
// If cs is non-empty it names the charset to decode r with; an error is
// returned when charset.Lookup does not recognize the name. If cs is empty,
// charset.NewReader is asked to guess the document's charset instead.
func ParseHTML(r io.Reader, cs string) (*html.Node, error) {
	if cs != "" {
		// The user named a charset explicitly: resolve it and decode with it.
		enc, canonical := charset.Lookup(cs)
		if canonical == "" {
			return nil, fmt.Errorf("'%s' is not a valid charset", cs)
		}
		return html.Parse(transform.NewReader(r, enc.NewDecoder()))
	}

	// No charset given: attempt to guess it from the document itself.
	decoded, err := charset.NewReader(r, "")
	if err != nil {
		return nil, err
	}
	return html.Parse(decoded)
}

func PrintHelp(w io.Writer, exitCode int) {
helpString := `Usage
pup [flags] [selectors] [optional display function]
Expand All @@ -28,6 +53,7 @@ Flags
-i --indent number of spaces to use for indent or character
-n --number print number of elements selected
-l --limit restrict number of levels printed
--charset specify the charset for pup to use
--version display version
`
fmt.Fprintf(w, helpString, VERSION)
Expand Down Expand Up @@ -81,6 +107,9 @@ func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) {
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
}
i++
case "--charset":
pupCharset = cmds[i+1]
i++
case "--version":
fmt.Println(VERSION)
os.Exit(0)
Expand Down
15 changes: 4 additions & 11 deletions pup.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"os"

"golang.org/x/net/html"
"golang.org/x/net/html/charset"
)

// _=,_
Expand All @@ -17,7 +16,7 @@ import (
// |/ \_( # |"
// C/ ,--___/

var VERSION string = "0.3.6"
var VERSION string = "0.3.7"

func main() {
// process flags and arguments
Expand All @@ -27,19 +26,13 @@ func main() {
os.Exit(2)
}

// Determine the charset of the input
cr, err := charset.NewReader(pupIn, "")
if err != nil {
fmt.Fprintf(os.Stderr, err.Error())
os.Exit(2)
}

// Parse the input and get the root node
root, err := html.Parse(cr)
root, err := ParseHTML(pupIn, pupCharset)
if err != nil {
fmt.Fprintf(os.Stderr, err.Error())
fmt.Fprintf(os.Stderr, "%s\n", err.Error())
os.Exit(2)
}
pupIn.Close()

// Parse the selectors
selectorFuncs := []SelectorFunc{}
Expand Down

0 comments on commit 9e98bc2

Please sign in to comment.