init commit with init add subcommands

This commit is contained in:
qwjyh 2024-06-07 18:07:10 +09:00
commit b3c9013f75
8 changed files with 452 additions and 0 deletions

10
.gitignore vendored Normal file
View file

@ -0,0 +1,10 @@
# python generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# venv
.venv

1
.python-version Normal file
View file

@ -0,0 +1 @@
3.12.2

18
README.md Normal file
View file

@ -0,0 +1,18 @@
# browser-history-merger
Merge browser histories into a single database.
# Usage
## Initialization
The first time you use the tool with a given browser on a given device, run
```sh
browser-history-merger path/to/merged.db init browser-id path/to/browser/history/database
```
`browser-id` must be a unique name that identifies the combination of browser and machine.
## Add histories
Then add that browser's history to the merged database with
```sh
browser-history-merger path/to/merged.db add browser-id
```

28
pyproject.toml Normal file
View file

@ -0,0 +1,28 @@
[project]
name = "browser-history-merger"
version = "0.1.0"
description = "Simple tool to merge browser histories into a single database"
authors = [
{ name = "qwjyh", email = "urataw421@gmail.com" }
]
dependencies = []
readme = "README.md"
license = { text = "MIT License" }
# The package uses `match` statements, which require Python 3.10 or newer.
requires-python = ">= 3.10"
[project.scripts]
"browser-history-merger" = "browser_history_merger:main"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.rye]
managed = true
dev-dependencies = []
[tool.hatch.metadata]
allow-direct-references = true
[tool.hatch.build.targets.wheel]
packages = ["src/browser_history_merger"]

10
requirements-dev.lock Normal file
View file

@ -0,0 +1,10 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
# pre: false
# features: []
# all-features: false
# with-sources: false
-e file:.

10
requirements.lock Normal file
View file

@ -0,0 +1,10 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
# pre: false
# features: []
# all-features: false
# with-sources: false
-e file:.

View file

@ -0,0 +1,371 @@
#!/usr/bin/env python
import argparse
import logging
import socket
import sqlite3
from typing import Literal
def init_db(
    root_con: sqlite3.Connection, root_cur: sqlite3.Cursor, args: argparse.Namespace
):
    """Create the root-db schema if it is missing, then register a browser.

    The schema (``browsers``, ``urls``, ``visits`` and ``transition_type``) is
    created only when the ``browsers`` table does not exist yet, so this can be
    run once per browser against the same root database.

    Args:
        root_con: Open connection to the merged (root) database.
        root_cur: Cursor on ``root_con`` used for every statement.
        args: Parsed CLI arguments; ``args.name`` is the unique browser name
            and ``args.database`` is the path to that browser's history db.

    Raises:
        ValueError: If ``args.name`` is already registered in ``browsers``.
    """
    print("Initialize db")
    logging.info("Initializing db")
    # Create db
    res = root_cur.execute(
        """
        SELECT
            *
        FROM
            sqlite_master
        WHERE
            type = 'table' AND name='browsers'
        """
    )
    if res.fetchone() is None:
        print("Creating root db")
        root_cur.execute(
            """
            CREATE TABLE browsers (
                id INTEGER PRIMARY KEY,
                name LONGVARCHAR NOT NULL UNIQUE,
                hostname LONGVARCHAR,
                visits_time_max INTEGER NOT NULL,
                database_path LONGVARCHAR NOT NULL
            )
            """
        )
        # UNIQUE(browser, original_id) lets `REPLACE INTO urls` refresh the
        # row of an already-known URL; without it every REPLACE with a NULL id
        # just appends another duplicate row.
        root_cur.execute(
            """
            CREATE TABLE urls (
                id INTEGER,
                browser INTEGER NOT NULL,
                original_id INTEGER,
                url LONGVARCHAR,
                title LONGVARCHAR,
                PRIMARY KEY("id" AUTOINCREMENT),
                FOREIGN KEY("browser") REFERENCES "browsers"("id"),
                UNIQUE("browser", "original_id")
            )
            """
        )
        # `visits` table
        # - id: visits id
        # - browser:
        # - url:
        # - title: urls.title at the time when the `add` is executed
        # - visit_time: usec with chromium offset
        root_cur.execute(
            """
            CREATE TABLE visits (
                id INTEGER,
                browser INTEGER NOT NULL,
                original_id INTEGER,
                url_id INTEGER NOT NULL,
                url LONGVARCHAR NOT NULL,
                title LONGVARCHAR,
                visit_time INTEGER NOT NULL,
                from_visit INTEGER,
                transition_qualifier INTEGER DEFAULT 0,
                transition_type INTEGER,
                PRIMARY KEY("id" AUTOINCREMENT),
                FOREIGN KEY("browser") REFERENCES "browsers"("id"),
                FOREIGN KEY("transition_type") REFERENCES "transition_type"("id")
            )
            """
        )
        # `transition_type`
        root_cur.execute(
            """
            CREATE TABLE transition_type (
                id INTEGER NOT NULL,
                name LONGVARCHAR,
                PRIMARY KEY("id")
            )
            """
        )
        visit_types = [
            (1, "link"),
            (2, "typed"),
            (3, "auto_bookmark"),
            (4, "auto_subframe"),
            (5, "manual_subframe"),
            (6, "generated"),
            (7, "auto_toplevel"),
            (8, "form_submit"),
            (9, "reload"),
            (10, "keyword"),
            (11, "keyword_generated"),
            (12, "redirect_permanent"),
            (13, "redirect_temporary"),
            (14, "download"),
            (0, "unknown"),
        ]
        root_cur.executemany(
            """
            INSERT INTO transition_type VALUES(?, ?)
            """,
            visit_types,
        )
        root_con.commit()

    # Reject a duplicate name up front so the user gets a readable message
    # instead of the IntegrityError the UNIQUE column would raise.
    res = root_cur.execute(
        """
        SELECT
            browsers.name
        FROM
            browsers
        WHERE
            browsers.name = (?)
        """,
        [args.name],
    )
    if res.fetchone() is not None:
        print(f"The name {args.name} is already used")
        raise ValueError("The provided name for the browser is already used")

    # visits_time_max starts at 0 for a newly registered browser.
    root_cur.execute(
        """
        INSERT INTO browsers VALUES(NULL, ?, ?, 0, ?)
        """,
        [args.name, socket.gethostname(), args.database],
    )
    root_con.commit()
def get_db_type(cur: sqlite3.Cursor) -> Literal["firefox", "chromium"]:
    """Classify the history database behind *cur*.

    Returns ``"chromium"`` when the database contains a table named ``urls``
    and ``"firefox"`` in every other case.
    """
    urls_table = cur.execute(
        """
        SELECT
            *
        FROM
            sqlite_master
        WHERE
            type='table' AND name='urls'
        """
    ).fetchone()
    if urls_table is None:
        return "firefox"
    return "chromium"
def get_browser_info(root_cur: sqlite3.Cursor, name: str) -> tuple[int, int, str]:
    """Look up a registered browser in the root db's ``browsers`` table.

    Args:
        root_cur: Cursor on the merged (root) database.
        name: Browser name to look up.

    Returns:
        ``(id, visits_time_max, database_path)`` of the matching row.

    Raises:
        ValueError: If no browser with this name is registered.
    """
    row = root_cur.execute(
        """
        SELECT
            id,
            visits_time_max,
            database_path
        FROM
            browsers
        WHERE
            browsers.name = (?)
        """,
        (name,),
    ).fetchone()
    if row is None:
        # fetchone() yields None for an unknown name; fail with a readable
        # message instead of an opaque tuple-unpacking TypeError.
        raise ValueError(
            f"No browser named {name!r} is registered in the root db; run `init` first"
        )
    browser_id, visits_time_max, database_path = row
    return (browser_id, visits_time_max, database_path)
def convert_chromium_transition_type(transition_qualifier: int) -> int:
    """
    Map a chromium transition value to the transition type id defined in doc.

    Only the value modulo 0x100 is considered: 0..10 map to ids 1..11,
    anything else maps to 0 (unknown).
    """
    core_type = transition_qualifier % 0x100
    if 0 <= core_type <= 10:
        return core_type + 1
    return 0  # unknown
def convert_firefox_transition_type(transition_type: int) -> int:
    """
    Convert `visit_type` of firefox to transition type id defined in doc.

    Values 1..4 are kept as they are, 5..9 are remapped through a table,
    and every other value becomes 0.
    """
    if 1 <= transition_type <= 4:
        return transition_type
    remapped_ids = {5: 12, 6: 13, 7: 14, 8: 5, 9: 9}
    return remapped_ids.get(transition_type, 0)
def add_db(
root_con: sqlite3.Connection, root_cur: sqlite3.Cursor, args: argparse.Namespace
):
print("Add history to root db")
browser_id, visits_time_max, database_path = get_browser_info(root_cur, args.name)
logging.info(f"{browser_id=}, {visits_time_max=}")
logging.info(f"Source: {database_path}")
logging.info(f"Root: {args.root_db}")
dburi = f"file:{database_path}?mode=ro&nolock=1"
logging.info(f"DB uri: {dburi}")
con = sqlite3.connect(dburi, uri=True)
cur = con.cursor()
db_type = get_db_type(cur)
logging.info(f"DB type: {db_type}")
match db_type:
case "firefox":
logging.error("Not implemented")
raise RuntimeError("Not implemented")
case "chromium":
select_url_toupdate_sql = """
SELECT
urls.id,
urls.url,
urls.title
FROM
visits,
urls
WHERE
visits.visit_time > (?)
AND visits.url = urls.id
"""
select_visit_sql = """
SELECT
visits.id,
visits.url,
urls.url,
urls.title,
visits.visit_time,
visits.from_visit,
visits.transition
FROM
visits,
urls
WHERE
visits.visit_time > (?)
AND visits.url = urls.id
"""
convert_transition_type = convert_chromium_transition_type
convert_transition_qualifier = lambda x: x
res = cur.execute(select_url_toupdate_sql, [visits_time_max])
updating_urls = (
(
browser_id,
id,
url,
title,
)
for id, url, title in res
)
root_cur.executemany(
"""
REPLACE INTO urls
VALUES(NULL, ?, ?, ?, ?)
""",
updating_urls,
)
print(f"Wrote {root_cur.rowcount} urls")
root_con.commit()
print(f"Wrote {root_cur.rowcount} urls")
logging.info("updated urls in new visits")
res = cur.execute(select_visit_sql, [visits_time_max])
new_visits = (
(
browser_id,
id,
url_id,
url,
title,
visit_time,
from_visit,
convert_transition_qualifier(transition),
convert_transition_type(transition),
)
for id, url_id, url, title, visit_time, from_visit, transition in res
)
root_cur.executemany(
"""
INSERT INTO visits
VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
new_visits,
)
print(f"Wrote {root_cur.rowcount} visits")
root_con.commit()
print(f"Wrote {root_cur.rowcount} visits")
logging.info("added new visits")
# update visits_time_max
res = root_cur.execute(
"""
SELECT
max(visits.visit_time)
FROM
visits
WHERE
visits.browser = (?)
""",
[browser_id],
)
(new_urls_time_max,) = res.fetchone()
logging.info(f"{new_urls_time_max=}")
root_cur.execute(
"""
UPDATE
browsers
SET
visits_time_max = (?)
WHERE
browsers.id = (?)
""",
(new_urls_time_max, browser_id),
)
root_con.commit()
logging.info("Updated browser information")
def main() -> int:
parser = argparse.ArgumentParser(description="Browser history merger")
parser.add_argument("root_db", help="Merged database path")
parser.add_argument(
"-v", "--verbosity", action="count", default=0, help="Increase log verbosity"
)
subparsers = parser.add_subparsers()
parser_init = subparsers.add_parser("init", help="Initialize root db")
parser_init.add_argument("name", help="Unique name for the browser")
parser_init.add_argument("database", help="Path to the browser's history db")
parser_init.set_defaults(func=init_db)
parse_add = subparsers.add_parser("add", help="Add history to root db")
# parse_add.add_argument("db", help="Source db file")
parse_add.add_argument(
"name", help="Source browser name(which was added to root db before)"
)
parse_add.set_defaults(func=add_db)
args = parser.parse_args()
match args.verbosity:
case 0:
logging.basicConfig(level=logging.WARN)
case 1:
logging.basicConfig(level=logging.INFO)
case _:
logging.basicConfig(level=logging.DEBUG)
logging.debug(f"{args=}")
root_db_path = args.root_db
root_con = sqlite3.connect(root_db_path)
root_cur = root_con.cursor()
if not hasattr(args, "func"):
parser.print_help()
return 1
args.func(root_con, root_cur, args)
return 0
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the exit status; calling
    # main() bare would always exit with status 0.
    raise SystemExit(main())

View file

@ -0,0 +1,4 @@
"""Entry script: run the merger's ``main()`` and exit with its status."""
import sys

import browser_history_merger

exit_status = browser_history_merger.main()
sys.exit(exit_status)