Skip to content

Commit

Permalink
refactor: Migrate ePub generation to go-epub (#679)
Browse files Browse the repository at this point in the history
  • Loading branch information
Monirzadeh committed Sep 16, 2023
1 parent 4df7e1d commit 9e91029
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 270 deletions.
3 changes: 3 additions & 0 deletions go.mod
Expand Up @@ -11,6 +11,7 @@ require (
github.com/gin-contrib/requestid v0.0.6
github.com/gin-contrib/static v0.0.1
github.com/gin-gonic/gin v1.9.1
github.com/go-shiori/go-epub v1.2.0
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad
github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d
github.com/go-sql-driver/mysql v1.7.1
Expand Down Expand Up @@ -57,6 +58,7 @@ require (
github.com/go-playground/validator/v10 v10.15.3 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/gofrs/uuid/v5 v5.0.0 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/google/uuid v1.3.1 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
Expand All @@ -81,6 +83,7 @@ require (
github.com/tdewolff/parse v2.3.4+incompatible // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.11 // indirect
github.com/vincent-petithory/dataurl v1.0.0 // indirect
go.etcd.io/bbolt v1.3.7 // indirect
go.uber.org/atomic v1.11.0 // indirect
golang.org/x/arch v0.5.0 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Expand Up @@ -87,6 +87,8 @@ github.com/go-playground/validator/v10 v10.15.3/go.mod h1:9iXMNT7sEkjXb0I+enO7QX
github.com/go-shiori/dom v0.0.0-20190930082056-9d974a4f8b25/go.mod h1:360KoNl36ftFYhjLHuEty78kWUGw8i1opEicvIDLfRk=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
github.com/go-shiori/go-epub v1.2.0 h1:c2b3DblHpNIiD8ISlQ+0Mc/tsRmn1mX1l6Q/0LzavN4=
github.com/go-shiori/go-epub v1.2.0/go.mod h1:gQCqrK+dIMLA7JMd8GxdBvhn811wb7XCa733RxWfPYw=
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad h1:3VP5Q8Mh165h2DHmXWFT4LJlwwvgTRlEuoe2vnsVnJ4=
github.com/go-shiori/go-readability v0.0.0-20230421032831-c66949dfc0ad/go.mod h1:2DpZlTJO/ycxp/vsc/C11oUyveStOgIXB88SYV1lncI=
github.com/go-shiori/warc v0.0.0-20200621032813-359908319d1d h1:+SEf4hYDaAt2eyq8Xu3YyWCpnMsK8sZfbYsDRFCUgBM=
Expand Down Expand Up @@ -258,6 +260,8 @@ github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLY
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/vincent-petithory/dataurl v1.0.0 h1:cXw+kPto8NLuJtlMsI152irrVw9fRDX8AbShPRpg2CI=
github.com/vincent-petithory/dataurl v1.0.0/go.mod h1:FHafX5vmDzyP+1CQATJn7WFKc9CvnvxyvZy6I1MrG/U=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
Expand Down
218 changes: 19 additions & 199 deletions internal/core/ebook.go
@@ -1,17 +1,13 @@
package core

import (
"archive/zip"
"fmt"
"io"
"log"
"net/http"
"os"
fp "path/filepath"
"regexp"
"strconv"
"strings"

epub "github.com/go-shiori/go-epub"
"github.com/go-shiori/shiori/internal/model"
"github.com/pkg/errors"
)
Expand All @@ -20,8 +16,6 @@ import (
// The destination path `dstPath` should include file name with ".epub" extension
// The bookmark model will be used to update the UI based on whether this function is successful or not.
func GenerateEbook(req ProcessRequest, dstPath string) (book model.Bookmark, err error) {
// variable for store generated html code
var html string

book = req.Bookmark

Expand All @@ -30,8 +24,7 @@ func GenerateEbook(req ProcessRequest, dstPath string) (book model.Bookmark, err
return book, errors.New("bookmark ID is not valid")
}

// get current state of bookmark
// cheak archive and thumb
// Get the current state of the bookmark: check whether it has an archive and a thumbnail
strID := strconv.Itoa(book.ID)

imagePath := fp.Join(req.DataDir, "thumb", fmt.Sprintf("%d", book.ID))
Expand All @@ -45,192 +38,45 @@ func GenerateEbook(req ProcessRequest, dstPath string) (book model.Bookmark, err
book.HasArchive = true
}

// this function create ebook from reader mode of bookmark so
// This function creates the ebook from the reader-mode content of the bookmark,
// so it can't create an ebook from a PDF; return an error here if the bookmark is a PDF
contentType := req.ContentType
if strings.Contains(contentType, "application/pdf") {
return book, errors.New("can't create ebook for pdf")
}

// create temporary epub file
// Create temporary epub file
tmpFile, err := os.CreateTemp("", "ebook")
if err != nil {
return book, errors.Wrap(err, "can't create temporary EPUB file")
}
defer os.Remove(tmpFile.Name())

// Create zip archive
epubWriter := zip.NewWriter(tmpFile)
// Create last line of ebook
lastline := `<hr/><p style="text-align:center">Generated By <a href="https://github.com/go-shiori/shiori">Shiori</a> From <a href="` + book.URL + `">This Page</a></p>`

// Create the mimetype file
mimetypeWriter, err := epubWriter.Create("mimetype")
// Create ebook
ebook, err := epub.NewEpub(book.Title)
if err != nil {
return book, errors.Wrap(err, "can't create mimetype")
}
_, err = mimetypeWriter.Write([]byte("application/epub+zip"))
if err != nil {
return book, errors.Wrap(err, "can't write into mimetype file")
}

// Create the container.xml file
containerWriter, err := epubWriter.Create("META-INF/container.xml")
if err != nil {
return book, errors.Wrap(err, "can't create container.xml")
}

_, err = containerWriter.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>`))
if err != nil {
return book, errors.Wrap(err, "can't write into container.xml file")
}

contentOpfWriter, err := epubWriter.Create("OEBPS/content.opf")
if err != nil {
return book, errors.Wrap(err, "can't create content.opf")
}
_, err = contentOpfWriter.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="BookId">
<metadata>
<dc:title>` + book.Title + `</dc:title>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="content" href="content.html" media-type="application/xhtml+xml"/>
<item id="id" href="../style.css" media-type="text/css"/>
</manifest>
<spine toc="ncx">
<itemref idref="content"/>
</spine>
</package>`))
if err != nil {
return book, errors.Wrap(err, "can't write into container.opf file")
return book, errors.Wrap(err, "can't create EPUB")
}

// Create the style.css file
styleWriter, err := epubWriter.Create("style.css")
if err != nil {
return book, errors.Wrap(err, "can't create content.xml")
}
_, err = styleWriter.Write([]byte(`content {
display: block;
font-size: 1em;
line-height: 1.2;
padding-left: 0;
padding-right: 0;
text-align: justify;
margin: 0 5pt
}
img {
margin: auto;
display: block;
}`))
ebook.SetTitle(book.Title)
ebook.SetAuthor(book.Author)
ebook.SetDescription(book.Excerpt)
_, err = ebook.AddSection(`<h1 style="text-align:center"> `+book.Title+` </h1>`+book.HTML+lastline, book.Title, "", "")
if err != nil {
return book, errors.Wrap(err, "can't write into style.css file")
return book, errors.Wrap(err, "can't add ebook Section")
}
// Create the toc.ncx file
tocNcxWriter, err := epubWriter.Create("OEBPS/toc.ncx")
ebook.EmbedImages()
err = ebook.Write(tmpFile.Name())
if err != nil {
return book, errors.Wrap(err, "can't create toc.ncx")
return book, errors.Wrap(err, "can't create ebook file")
}
_, err = tocNcxWriter.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="urn:uuid:12345678-1234-5678-1234-567812345678"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>` + book.Title + `</text>
</docTitle>
<navMap>
<navPoint id="navPoint-1" playOrder="1">
<navLabel>
<text >` + book.Title + `</text>
</navLabel>
<content src="content.html"/>
</navPoint>
</navMap>
</ncx>`))
if err != nil {
return book, errors.Wrap(err, "can't write into toc.ncx file")
}

// get list of images tag in html
imageList, _ := GetImages(book.HTML)
imgRegex := regexp.MustCompile(`<img.*?src="([^"]*)".*?>`)

// Create a set to store unique image URLs
imageSet := make(map[string]bool)

// Download image in html file and generate new html
html = book.HTML
for _, match := range imgRegex.FindAllStringSubmatch(book.HTML, -1) {
imageURL := match[1]
if _, ok := imageList[imageURL]; ok && !imageSet[imageURL] {
// Add the image URL to the set
imageSet[imageURL] = true

// Download the image
resp, err := http.Get(imageURL)
if err != nil {
log.Fatal(err)
}
defer resp.Body.Close()

// Get the image data
imageData, err := io.ReadAll(resp.Body)
if err != nil {
return book, errors.Wrap(err, "can't get image from the internet")
}

fileName := fp.Base(imageURL)
filePath := "images/" + fileName
imageWriter, err := epubWriter.Create(filePath)
if err != nil {
log.Fatal(err)
}

// Write the image to the file
_, err = imageWriter.Write(imageData)
if err != nil {
return book, errors.Wrap(err, "can't create image file")
}
// Replace the image tag with the new downloaded image
html = strings.ReplaceAll(html, match[0], fmt.Sprintf(`<img src="../%s"/>`, filePath))
}
}
// Create the content.html file
contentHtmlWriter, err := epubWriter.Create("OEBPS/content.html")
if err != nil {
return book, errors.Wrap(err, "can't create content.xml")
}
_, err = contentHtmlWriter.Write([]byte("<?xml version='1.0' encoding='utf-8'?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n\t<title>" + book.Title + "</title>\n\t<link href=\"../style.css\" rel=\"stylesheet\" type=\"text/css\"/>\n</head>\n<body>\n\t<h1 dir=\"auto\">" + book.Title + "</h1>" + "\n<content dir=\"auto\">\n" + html + "\n</content>" + "\n</body></html>"))
if err != nil {
return book, errors.Wrap(err, "can't write into content.html")
}
// close epub and tmpFile
err = epubWriter.Close()
if err != nil {
return book, errors.Wrap(err, "failed to close EPUB writer")
}
err = tmpFile.Close()
if err != nil {
return book, errors.Wrap(err, "failed to close temporary EPUB file")
}
// open temporary file again
tmpFile, err = os.Open(tmpFile.Name())
if err != nil {
return book, errors.Wrap(err, "can't open temporary EPUB file")
}
defer tmpFile.Close()
// if everitings go well we start move ebook to dstPath

// If everything goes well, move the ebook to dstPath
err = MoveFileToDestination(dstPath, tmpFile)
if err != nil {
return book, errors.Wrap(err, "failed move ebook to destination")
Expand All @@ -239,29 +85,3 @@ img {
book.HasEbook = true
return book, nil
}

// imgTagPattern matches an <img> tag that carries a double-quoted src
// attribute; submatch 1 is the src value. It is compiled once at package
// scope so GetImages does not recompile it on every call.
var imgTagPattern = regexp.MustCompile(`<img.*?src="(.*?)".*?>`)

// GetImages scans html for <img> tags and returns a map from each image URL
// (the value of the src attribute) to the full text of a tag referencing it.
//
// Tags whose src starts with "data:image/" are left out of the result. When
// the same URL appears in several tags, the last matching tag is the one
// kept. If html contains no matching <img> tag at all, the returned map is
// nil (reads, len and range on it remain safe). The returned error is always
// nil.
func GetImages(html string) (map[string]string, error) {
	matches := imgTagPattern.FindAllStringSubmatch(html, -1)
	if len(matches) == 0 {
		return nil, nil
	}

	// At most one entry per match, so this capacity is an upper bound.
	images := make(map[string]string, len(matches))
	for _, m := range matches {
		tag, src := m[0], m[1]
		// Inline data URIs are not collected.
		if strings.HasPrefix(src, "data:image/") {
			continue
		}
		images[src] = tag
	}

	return images, nil
}
71 changes: 0 additions & 71 deletions internal/core/ebook_test.go
Expand Up @@ -171,74 +171,3 @@ func TestGenerateEbook(t *testing.T) {
})
})
}

// Add more unit tests for the scenarios that are still missing, especially:
// failing to create the ebook directory and failing to write the file,
// writing inside the zip file,
// the html variable that is not exported, and the image download loop

// TestGetImages feeds core.GetImages a series of HTML snippets and verifies,
// for each one, that no error is returned, that the number of collected
// images matches, and that every expected URL maps to its expected tag.
func TestGetImages(t *testing.T) {
	scenarios := []struct {
		html string
		want map[string]string
	}{
		// HTML with no image tags
		{
			html: `<html><body><h1>Hello, World!</h1></body></html>`,
			want: map[string]string{},
		},
		// HTML with one image tag
		{
			html: `<html><body><img src="image1.jpg"></body></html>`,
			want: map[string]string{"image1.jpg": "<img src=\"image1.jpg\">"},
		},
		// HTML with multiple image tags
		{
			html: `<html><body><img src="image1.jpg"><img src="image2.jpg"></body></html>`,
			want: map[string]string{
				"image1.jpg": "<img src=\"image1.jpg\">",
				"image2.jpg": "<img src=\"image2.jpg\">",
			},
		},
		// HTML with multiple image tags, one of them duplicated
		{
			html: `<html><body><img src="image1.jpg"><img src="image2.jpg"><img src="image2.jpg"></body></html>`,
			want: map[string]string{
				"image1.jpg": "<img src=\"image1.jpg\">",
				"image2.jpg": "<img src=\"image2.jpg\">",
			},
		},
	}

	for _, sc := range scenarios {
		got, err := core.GetImages(sc.html)
		if err != nil {
			t.Errorf("Unexpected error: %v", err)
		}
		if len(got) != len(sc.want) {
			t.Errorf("Expected %d images, but got %d", len(sc.want), len(got))
		}
		for url, tag := range sc.want {
			if got[url] != tag {
				t.Errorf("Expected image URL %s with tag %s, but got %s", url, tag, got[url])
			}
		}
	}
}

0 comments on commit 9e91029

Please sign in to comment.