Skip to content

Commit

Permalink
Merge pull request #23 from EricChiang/json
Browse files Browse the repository at this point in the history
json{} displayer added
  • Loading branch information
ericchiang committed Oct 11, 2014
2 parents dd9e318 + dfe4a38 commit 571adeb
Show file tree
Hide file tree
Showing 4 changed files with 223 additions and 79 deletions.
95 changes: 90 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,22 @@ Ew, HTML. Let's run that through some pup selectors:
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] attr{href}'
```

Even better, let's grab the titles too:

```bash
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] json{}'
```

## Basic Usage

```bash
$ cat index.html | pup [selectors and flags]
$ cat index.html | pup [flags] [selectors] [optional display function]
```

or

```bash
$ pup < index.html [selectors and flags]
$ pup < index.html [flags] [selectors] [optional display function]
```

## Examples
Expand Down Expand Up @@ -185,7 +191,7 @@ You can mix and match selectors as you wish.
cat index.html | pup element#id[attribute=value]
```

## Functions
## Display Functions

Non-HTML selectors which affect the output type are implemented as functions
which can be provided as a final argument.
Expand Down Expand Up @@ -231,6 +237,85 @@ $ pup < robots.html a attr{href} | head
//en.wikivoyage.org/wiki/
```

#### `json{}`

Print HTML as JSON.

```bash
$ cat robots.html | pup div#p-namespaces a
<a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c">
Article
</a>
<a href="/wiki/Talk:Robots_exclusion_standard" title="Discussion about the content page [t]" accesskey="t">
Talk
</a>
```

```bash
$ cat robots.html | pup div#p-namespaces a json{}
[
{
"attrs": {
"accesskey": "c",
"href": "/wiki/Robots_exclusion_standard",
"title": "View the content page [c]"
},
"tag": "a",
"text": "Article"
},
{
"attrs": {
"accesskey": "t",
"href": "/wiki/Talk:Robots_exclusion_standard",
"title": "Discussion about the content page [t]"
},
"tag": "a",
"text": "Talk"
}
]
```

Use the `-i` / `--indent` flag to control the indent level.

```bash
$ cat robots.html | pup --indent 4 div#p-namespaces a json{}
[
{
"attrs": {
"accesskey": "c",
"href": "/wiki/Robots_exclusion_standard",
"title": "View the content page [c]"
},
"tag": "a",
"text": "Article"
},
{
"attrs": {
"accesskey": "t",
"href": "/wiki/Talk:Robots_exclusion_standard",
"title": "Discussion about the content page [t]"
},
"tag": "a",
"text": "Talk"
}
]
```

If the selectors only return one element the results will be printed as a JSON
object, not a list.

```bash
$ cat robots.html | pup --indent 4 title json{}
{
"tag": "title",
"text": "Robots exclusion standard - Wikipedia, the free encyclopedia"
}
```

Because there is no universal standard for converting HTML/XML to JSON, a
method has been chosen which hopefully fits. The goal is simply to get the
output of pup into a more consumable format.

## Flags

```bash
Expand All @@ -243,6 +328,6 @@ $ pup < robots.html a attr{href} | head
--version display version
```

## TODO:
## TODO

* Print as json function `json{}`
Add more tests!
130 changes: 130 additions & 0 deletions display.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package main

import (
"encoding/json"
"fmt"
"regexp"
"strings"

"code.google.com/p/go.net/html"
)

// Displayer is the interface implemented by the display functions in this
// file (TextDisplayer, AttrDisplayer and JSONDisplayer). Each of those
// implementations writes its rendering of the given nodes to standard output.
type Displayer interface {
// Display renders nodes; it returns nothing, so implementations report
// their result by printing.
Display(nodes []*html.Node)
}

// TextDisplayer is the Displayer returned for the text{} display function.
// It has no configuration.
type TextDisplayer struct{}

// Display visits every node and all of its descendants depth-first, in
// document order, and prints the data of each text node on its own line.
// Nodes of any other type contribute nothing themselves but are still
// descended into.
func (t TextDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		if n.Type == html.TextNode {
			fmt.Println(n.Data)
		}

		// Gather the direct children so the same method can recurse on them.
		var kids []*html.Node
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			kids = append(kids, c)
		}
		t.Display(kids)
	}
}

// AttrDisplayer is the Displayer returned for the attr{name} display
// function.
type AttrDisplayer struct {
	// Attr is the attribute key whose values are printed.
	Attr string
}

// Display prints, one per line, the HTML-escaped value of every attribute on
// the given nodes whose key equals a.Attr. Only the nodes passed in are
// inspected; their descendants are not visited.
func (a AttrDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		for i := range n.Attr {
			if n.Attr[i].Key != a.Attr {
				continue
			}
			fmt.Printf("%s\n", html.EscapeString(n.Attr[i].Val))
		}
	}
}

// JSONDisplayer is the Displayer that NewDisplayFunc returns for the json{}
// display function. It carries no state.
type JSONDisplayer struct {
}

// jsonify converts node into a map that encoding/json can marshal.
//
// The returned map uses the following keys:
//
//   - "tag":      the element name. DataAtom is zero (and stringifies to "")
//     for names that are not known atoms, e.g. custom elements, so
//     element nodes fall back to node.Data in that case.
//   - "attrs":    attribute key -> HTML-escaped value; omitted when the node
//     has no attributes.
//   - "text":     the whitespace-trimmed data of the node's direct text
//     children, joined with single spaces; omitted when there is
//     no non-blank text.
//   - "children": the jsonified direct element children, in document order;
//     omitted when there are none.
func jsonify(node *html.Node) map[string]interface{} {
	vals := map[string]interface{}{}

	if len(node.Attr) > 0 {
		attrs := make(map[string]string, len(node.Attr))
		for _, attr := range node.Attr {
			attrs[attr.Key] = html.EscapeString(attr.Val)
		}
		vals["attrs"] = attrs
	}

	tag := node.DataAtom.String()
	if tag == "" && node.Type == html.ElementNode {
		// Unknown tag name: the atom is zero, but Data still holds the name.
		tag = node.Data
	}
	vals["tag"] = tag

	var (
		texts    []string
		children []interface{}
	)
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		switch child.Type {
		case html.ElementNode:
			children = append(children, jsonify(child))
		case html.TextNode:
			if text := strings.TrimSpace(child.Data); text != "" {
				texts = append(texts, text)
			}
		}
	}
	if len(texts) > 0 {
		vals["text"] = strings.Join(texts, " ")
	}
	// Previously the collected children were dropped on the floor, so nested
	// elements never appeared in the output.
	if len(children) > 0 {
		vals["children"] = children
	}
	return vals
}

// Display prints nodes to standard output as indented JSON followed by a
// newline. Exactly one node is printed as a single JSON object; any other
// count (including zero) is printed as a JSON array of objects. Each object
// is produced by jsonify, and indentString (declared outside this file
// section) is used as the per-level indent. It panics if marshalling fails.
func (j JSONDisplayer) Display(nodes []*html.Node) {
	var payload interface{}
	if len(nodes) == 1 {
		payload = jsonify(nodes[0])
	} else {
		// Non-nil even when empty so that no nodes encodes as [] rather
		// than null.
		objs := make([]map[string]interface{}, 0, len(nodes))
		for _, n := range nodes {
			objs = append(objs, jsonify(n))
		}
		payload = objs
	}

	out, err := json.MarshalIndent(payload, "", indentString)
	if err != nil {
		panic("Could not jsonify nodes")
	}
	fmt.Printf("%s\n", out)
}

// Display function helpers: patterns used by NewDisplayFunc to recognise a
// display function argument.

// displayMatcher accepts any argument that ends in a "{...}" group.
var displayMatcher = regexp.MustCompile(`\{[^\}]*\}$`)

// textFuncMatcher accepts exactly "text{}".
var textFuncMatcher = regexp.MustCompile(`^text\{\}$`)

// attrFuncMatcher accepts "attr{...}" and captures the text between the
// braces as its single submatch.
var attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`)

// jsonFuncMatcher accepts "json{...}"; anything between the braces is
// captured but allowed.
var jsonFuncMatcher = regexp.MustCompile(`^json\{([^\}]*)\}$`)

// NewDisplayFunc interprets text as a display function and returns the
// Displayer that implements it:
//
//   - "text{}"      -> TextDisplayer
//   - "attr{name}"  -> AttrDisplayer with Attr set to name
//   - "json{...}"   -> JSONDisplayer
//
// Any other input yields a nil Displayer and a non-nil error.
func NewDisplayFunc(text string) (Displayer, error) {
	// Cheap pre-check: every display function ends in a "{...}" group.
	if !displayMatcher.MatchString(text) {
		return nil, fmt.Errorf("Not a display function")
	}

	if textFuncMatcher.MatchString(text) {
		return TextDisplayer{}, nil
	}
	// FindStringSubmatch returns nil when there is no match and otherwise
	// the whole match followed by the single capture group, so one call both
	// tests the pattern and extracts the attribute name.
	if m := attrFuncMatcher.FindStringSubmatch(text); m != nil {
		return AttrDisplayer{Attr: m[1]}, nil
	}
	if jsonFuncMatcher.MatchString(text) {
		return JSONDisplayer{}, nil
	}
	return nil, fmt.Errorf("Not a display function")
}
70 changes: 0 additions & 70 deletions funcs/display.go

This file was deleted.

7 changes: 3 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@ import (
"code.google.com/p/go.net/html"
"code.google.com/p/go.net/html/charset"
"fmt"
"github.com/ericchiang/pup/funcs"
"github.com/ericchiang/pup/selector"
"io"
"os"
"strconv"
"strings"
)

const VERSION string = "0.3.0"
const VERSION string = "0.3.1"

var (
// Flags
Expand All @@ -22,7 +21,7 @@ var (
maxPrintLevel int = -1
printNumber bool = false
printColor bool = false
displayer funcs.Displayer = nil
displayer Displayer = nil
)

// Print to stderr and exit
Expand Down Expand Up @@ -177,7 +176,7 @@ func main() {
// if this is the last element, check for a function like
// text{} or attr{}
if i+1 == len(cmds) {
d, err := funcs.NewDisplayFunc(cmd)
d, err := NewDisplayFunc(cmd)
if err == nil {
displayer = d
selectors = selectors[0 : len(cmds)-1]
Expand Down

0 comments on commit 571adeb

Please sign in to comment.