Skip to content

Commit

Permalink
doc: A lot of images skipped from Google Photos Takeout #68
Browse files Browse the repository at this point in the history
  • Loading branch information
simulot committed Dec 1, 2023
2 parents 6bd6067 + 3ad5ed9 commit c1ac8a7
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 110 deletions.
173 changes: 90 additions & 83 deletions browser/gp/googlephotos.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@ import (

type Takeout struct {
fsys fs.FS
filesByDir map[string][]fileKey // files name mapped by dir
filesByDir map[string][]fileReference // files name mapped by dir
jsonByYear map[jsonKey]*GoogleMetaData // assets by year of capture and full path
albumsByDir map[string]browser.LocalAlbum // album title mapped by dir
log logger.Logger
conf *browser.Configuration
}

type fileReference struct {
fileKey
taken bool // True, when the file as been associated to a json and sent to the uploader
}
type fileKey struct {
name string
size int64
Expand All @@ -40,7 +44,7 @@ type Album struct {
func NewTakeout(ctx context.Context, fsys fs.FS, log logger.Logger, conf *browser.Configuration) (*Takeout, error) {
to := Takeout{
fsys: fsys,
filesByDir: map[string][]fileKey{},
filesByDir: map[string][]fileReference{},
jsonByYear: map[jsonKey]*GoogleMetaData{},
albumsByDir: map[string]browser.LocalAlbum{},
log: log,
Expand Down Expand Up @@ -119,7 +123,7 @@ func (to *Takeout) walk(ctx context.Context, fsys fs.FS) error {
if err != nil {
return err
}
key := fileKey{name: base, size: info.Size()}
key := fileReference{fileKey: fileKey{name: base, size: info.Size()}}
l := to.filesByDir[dir]
l = append(l, key)
to.filesByDir[dir] = l
Expand All @@ -130,7 +134,42 @@ func (to *Takeout) walk(ctx context.Context, fsys fs.FS) error {
return err
}

type matcherFn func(jsonName string, fileName string) bool

// matchers is a list of matcherFn from the most likely to be used to the least one
var matchers = []matcherFn{
normalMatch,
matchWithOneCharOmitted,
matchVeryLongNameWithNumber,
matchDuplicateInYear,
matchEditedName,
matchForgottenDuplicates,
}

// Browse gives back to the main program the list of assets with resolution of file name, album, dates...
//
// JSON files give important information about the relative photos / movies:
// - The original name (useful when it as been truncated)
// - The date of capture (useful when the files doesn't have this date)
// - The GPS coordinates (will be useful in a future release)
//
// Each JSON is checked. JSON is duplicated in albums folder.
// Associated files with the JSON can be found in the JSON's folder, or in the Year photos.
// Once associated and sent to the main program, files are tagged for not been associated with an other one JSON.
// Association is done with the help of a set of matcher functions. Each one implement a rule
//
// 1 JSON can be associated with 1+ files that have a part of their name in common.
// - the file is named after the JSON name
// - the file name can be 1 UTF-16 char shorter (🤯) than the JSON name
// - the file name is longer than 46 UTF-16 chars (🤯) is truncated. But the truncation can creates duplicates, then a number is added.
// - if there are several files with same original name, the first instance kept as it is, the next have a a sequence number.
// File is renamed as IMG_1234(1).JPG and the JSON is renamed as IMG_1234.JPG(1).JSON
// - of course those rules are likely to collide. They have to be applied from the most common to the least one.
// - sometimes the file isn't in the same folder than the json... It can be found in Year's photos folder
//
// The duplicates files (same name, same length in bytes) found in the local source are discarded before been presented to the immich server.
//

func (to *Takeout) Browse(ctx context.Context) chan *browser.LocalAssetFile {
c := make(chan *browser.LocalAssetFile)
passed := map[fileKey]any{}
Expand All @@ -142,53 +181,59 @@ func (to *Takeout) Browse(ctx context.Context) chan *browser.LocalAssetFile {
return to.jsonByYear[jsonFile[i]].foundInPaths[0] < to.jsonByYear[jsonFile[j]].foundInPaths[0]
})

for _, k := range jsonFile {
md := to.jsonByYear[k]
to.log.Debug("Checking '%s', %d", k.name, k.year)
assets := to.jsonAssets(k, md)

for _, a := range assets {
ext := path.Ext(a.FileName)
if !to.conf.SelectExtensions.Include(ext) {
to.conf.Journal.AddEntry(a.FileName, journal.DISCARDED, "because of select-type option")
continue
}
if to.conf.ExcludeExtensions.Exclude(ext) {
to.conf.Journal.AddEntry(a.FileName, journal.DISCARDED, "because of exclude-type option")
continue
}
fk := fileKey{name: path.Base(a.FileName), size: int64(a.FileSize)}
if _, exist := passed[fk]; !exist {
passed[fk] = nil
select {
case <-ctx.Done():
return
default:
c <- a
// For the most common matcher to the least,
// Check files that match each json files
for _, matcher := range matchers {
for _, k := range jsonFile {
md := to.jsonByYear[k]
assets := to.jsonAssets(k, md, matcher)

for _, a := range assets {
to.conf.Journal.AddEntry(a.FileName, journal.JSON, k.name)
ext := path.Ext(a.FileName)
if !to.conf.SelectExtensions.Include(ext) {
to.conf.Journal.AddEntry(a.FileName, journal.DISCARDED, "because of select-type option")
continue
}
if to.conf.ExcludeExtensions.Exclude(ext) {
to.conf.Journal.AddEntry(a.FileName, journal.DISCARDED, "because of exclude-type option")
continue
}
fk := fileKey{name: path.Base(a.FileName), size: int64(a.FileSize)}
if _, exist := passed[fk]; !exist {
passed[fk] = nil
select {
case <-ctx.Done():
return
default:
c <- a
}
} else {
to.conf.Journal.AddEntry(a.FileName, journal.LOCAL_DUPLICATE, fk.name)
}
} else {
to.conf.Journal.AddEntry(a.FileName, journal.LOCAL_DUPLICATE, fk.name)
}
}
}

leftOver := 0
for _, l := range to.filesByDir {
for _, f := range l {
if !f.taken {
leftOver++
}
}
}
to.log.Error("%d files left over", leftOver)

}()

return c

}

// jsonAssets search assets that are linked to this JSON
//
// the asset is named after the JSON name
// the asset name can be 1 char shorter than the JSON name
// but several assets can match with the JSON 🤯
// the asset can be placed in another folder than the JSON
// when the JSON is found in an album dir, the asset belongs to the album
// but the image can be found in year's folder 🤯
// the asset name is the JSON title field
// When there are more thant one asset, asset names must be derived from the json title.
// jsonAssets search assets that are linked to this JSON using the given matcher

func (to *Takeout) jsonAssets(key jsonKey, md *GoogleMetaData) []*browser.LocalAssetFile {
func (to *Takeout) jsonAssets(key jsonKey, md *GoogleMetaData, matcher matcherFn) []*browser.LocalAssetFile {

var list []*browser.LocalAssetFile

Expand All @@ -203,27 +248,17 @@ func (to *Takeout) jsonAssets(key jsonKey, md *GoogleMetaData) []*browser.LocalA
}
}
if !jsonInYear {
// add the Year folder to the list to search files there as well
// TODO: is it needed for real archives?
paths = append(paths, yearDir)
}

// Search for the assets in folders where the JSON has been found
for _, d := range paths {
l := to.filesByDir[d]

for _, f := range l {

matched := normalMatch(key.name, f.name)
matched = matched || matchWithOneCharOmitted(key.name, f.name)
matched = matched || matchVeryLongNameWithNumber(key.name, f.name)
matched = matched || matchDuplicateInYear(key.name, f.name)
matched = matched || matchEditedName(key.name, f.name)
matched = matched || matchMPNames(key.name, f.name)
// matched = matched || matchForgottenDuplicates(key.name, f.name)

if matched {
for i, f := range l {
if !f.taken && matcher(key.name, f.name) {
list = append(list, to.copyGoogleMDToAsset(md, path.Join(d, f.name), int(f.size)))
l[i].taken = true
}
}
}
Expand Down Expand Up @@ -325,41 +360,14 @@ func matchEditedName(jsonName string, fileName string) bool {
if ext != "" {
if _, err := fshelper.MimeFromExt(ext); err == nil {
base := strings.TrimSuffix(base, ext)
fileName = strings.TrimSuffix(fileName, path.Ext(fileName))
return strings.HasPrefix(fileName, base)
}
}
return false
}

// matchMPNames
// PXL_20221228_185930354.MP.jpg.json
// PXL_20221228_185930354.MP
// PXL_20221228_185930354.MP.jpg

func matchMPNames(jsonName string, fileName string) bool {
base := strings.TrimSuffix(jsonName, path.Ext(jsonName))
fileExt := strings.ToLower(path.Ext(fileName))
if fileExt != ".mp" {
return false
}
ext := path.Ext(base)
if ext != "" {
if _, err := fshelper.MimeFromExt(ext); err == nil {
base := strings.TrimSuffix(base, ext)
// fileName = strings.TrimSuffix(fileName, path.Ext(fileName))
if strings.HasPrefix(fileName, base) {
return true
}
base = strings.TrimSuffix(base, path.Ext(base))
return strings.HasPrefix(fileName, base)
fname := strings.TrimSuffix(fileName, path.Ext(fileName))
return strings.HasPrefix(fname, base)
}
}
return false
}

/*
TODO: This one interferes with matchVeryLongNameWithNumber
//TODO: This one interferes with matchVeryLongNameWithNumber

// matchForgottenDuplicates
// original_1d4caa6f-16c6-4c3d-901b-9387de10e528_.json
Expand All @@ -377,7 +385,6 @@ func matchForgottenDuplicates(jsonName string, fileName string) bool {
}
return false
}
*/

func (to *Takeout) copyGoogleMDToAsset(md *GoogleMetaData, filename string, length int) *browser.LocalAssetFile {
// Change file's title with the asset's title and the actual file's extension
Expand Down
39 changes: 34 additions & 5 deletions browser/gp/googlephotos_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ func Test_matchEditedName(t *testing.T) {
fileName: "DSC_0238.JPG",
want: true,
},
{
jsonName: "DSC_0238.JPG.json",
fileName: "DSC_0238(1).JPG",
want: false,
},
// {
// jsonName: "DSC_0238.JPG.json",
// fileName: "DSC_0238(1).JPG",
// want: false,
// },
}
for _, tt := range tests {
t.Run(tt.fileName, func(t *testing.T) {
Expand Down Expand Up @@ -92,3 +92,32 @@ func Test_matchDuplicateInYear(t *testing.T) {
})
}
}

func Test_matchForgottenDuplicates(t *testing.T) {
tests := []struct {
name string
jsonName string
fileName string
want bool
}{
{
name: "match1",
jsonName: "1556189729458-8d2e2d13-bca5-467e-a242-9e4cb238.json",
fileName: "1556189729458-8d2e2d13-bca5-467e-a242-9e4cb238e.jpg",
want: true,
},
{
name: "match2",
jsonName: "1556189729458-8d2e2d13-bca5-467e-a242-9e4cb238.json",
fileName: "1556189729458-8d2e2d13-bca5-467e-a242-9e4cb238e(1).jpg",
want: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := matchForgottenDuplicates(tt.jsonName, tt.fileName); got != tt.want {
t.Errorf("matchDuplicateInYear() = %v, want %v", got, tt.want)
}
})
}
}
6 changes: 0 additions & 6 deletions browser/gp/testgp_samples_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,6 @@ func namesWithNumbers() *inMemFS {
addImage("Takeout/Google Photos/Photos from 2009/IMG_3479(2).JPG", 15)
}

// func namesWithNumbersBut1() *inMemFS {
// return newInMemFS().
// addJSONImage("Takeout/Google Photos/Photos from 2009/IMG_3479.JPG.json", "IMG_3479.JPG").
// addImage("Takeout/Google Photos/Photos from 2009/IMG_3479_1.JPG", 10)
// }

func namesTruncated() *inMemFS {
return newInMemFS().
addJSONImage("Takeout/Google Photos/Photos from 2023/😀😃😄😁😆😅😂🤣🥲☺️😊😇🙂🙃😉😌😍🥰😘😗😙😚😋.json", "😀😃😄😁😆😅😂🤣🥲☺️😊😇🙂🙃😉😌😍🥰😘😗😙😚😋😛😝😜🤪🤨🧐🤓😎🥸🤩🥳😏😒😞😔😟😕🙁☹️😣😖😫😩🥺😢😭😤😠😡🤬🤯😳🥵🥶.jpg").
Expand Down
20 changes: 8 additions & 12 deletions browser/gp/testgp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ func TestBrowse(t *testing.T) {
gen func() *inMemFS
results []fileResult // file name / title
}{

{"simpleYear", simpleYear,
sortFileResult([]fileResult{
{name: "PXL_20230922_144936660.jpg", size: 10, title: "PXL_20230922_144936660.jpg"},
Expand Down Expand Up @@ -76,7 +77,6 @@ func TestBrowse(t *testing.T) {
},
{"issue68MPFiles", issue68MPFiles,
sortFileResult([]fileResult{
{name: "PXL_20221228_185930354.MP", size: 1, title: "PXL_20221228_185930354.MP"},
{name: "PXL_20221228_185930354.MP.jpg", size: 2, title: "PXL_20221228_185930354.MP.jpg"},
}),
},
Expand All @@ -87,17 +87,13 @@ func TestBrowse(t *testing.T) {
}),
},

// {"namesWithNumbersBut1", namesWithNumbersBut1,
// sortFileResult([]fileResult{}),
// },

// {
// "issue68ForgottenDuplicates", issue68ForgottenDuplicates,
// sortFileResult([]fileResult{
// {name: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_P.jpg", size: 1, title: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_PXL_20220516_164814158.jpg"},
// {name: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_P(1).jpg", size: 2, title: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_PXL_20220516_164814158.jpg"},
// }),
// },
{
"issue68ForgottenDuplicates", issue68ForgottenDuplicates,
sortFileResult([]fileResult{
{name: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_P.jpg", size: 1, title: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_PXL_20220516_164814158.jpg"},
{name: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_P(1).jpg", size: 2, title: "original_1d4caa6f-16c6-4c3d-901b-9387de10e528_PXL_20220516_164814158.jpg"},
}),
},
}
for _, c := range tc {
t.Run(c.name, func(t *testing.T) {
Expand Down
20 changes: 16 additions & 4 deletions docs/releases.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,22 @@

Improvement for the takeout import.
- The log indicate with JSON is associated to an file.
- Fix the JSON association for very long file name and duplicated


## Release 0.8.8
- JSON and files are associated by applying successively rules from the most common to the strangest one

Each JSON is checked. JSON is duplicated in albums folder.<br>
Associated files with the JSON can be found in the JSON's folder, or in the Year photos.<br>

Once associated and sent to the main program, files are tagged for not been associated with an other one JSON.<br>
Association is done with the help of a set of matcher functions. Each one implement a rule<br>
1 JSON can be associated with 1+ files that have a part of their name in common.<br>
- the file is named after the JSON name<br>
- the file name can be 1 UTF-16 char shorter (🤯) than the JSON name<br>
- the file name is longer than 46 UTF-16 chars (🤯) is truncated. But the truncation can creates duplicates, then a number is added.
- if there are several files with same original name, the first instance kept as it is, the next have a a sequence number. File is renamed as IMG_1234(1).JPG and the JSON is renamed as IMG_1234.JPG(1).JSON
- of course those rules are likely to collide. They have to be applied from the most common to the least one.<br>
- sometimes the file isn't in the same folder than the json... It can be found in Year's photos folder<br>
The duplicates files (same name, same length in bytes) found in the local source are discarded before been <br>presented to the immich server.
Release 0.8.8<br>

### fix for #86: unknown time zone Argentina/Buenos_Aires

Expand Down
1 change: 1 addition & 0 deletions journal/journal.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ const (
UNHANDLED Action = "File unhandled"
HANDLED Action = "File handled"
INFO Action = "Info"
JSON Action = "Associated JSON"
)

func NewJournal(log logger.Logger) *Journal {
Expand Down

0 comments on commit c1ac8a7

Please sign in to comment.