From b3c9013f75414eae7a278d480d1ebb0763706b51 Mon Sep 17 00:00:00 2001 From: qwjyh Date: Fri, 7 Jun 2024 18:07:10 +0900 Subject: [PATCH] init commit with `init` `add` subcommands --- .gitignore | 10 + .python-version | 1 + README.md | 18 ++ pyproject.toml | 28 ++ requirements-dev.lock | 10 + requirements.lock | 10 + src/browser_history_merger/__init__.py | 371 +++++++++++++++++++++++++ src/browser_history_merger/__main__.py | 4 + 8 files changed, 452 insertions(+) create mode 100644 .gitignore create mode 100644 .python-version create mode 100644 README.md create mode 100644 pyproject.toml create mode 100644 requirements-dev.lock create mode 100644 requirements.lock create mode 100644 src/browser_history_merger/__init__.py create mode 100644 src/browser_history_merger/__main__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b2546d8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# venv +.venv \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..8531a3b --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12.2 diff --git a/README.md b/README.md new file mode 100644 index 0000000..e8191bd --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# browser-history-merger + +Merge browser histories into a single database. + +# Usage +## Initialization +For the first execution on each device and browser, do +```sh +browser-history-merger path/to/merged.db init browser-id path/to/browser/history/database +``` +`browser-id` should be unique to identify browser and machine. + +## Add histories +Then add histories to the database by +```sh +browser-history-merger path/to/merged.db add browser-id +``` + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b489226 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = "browser-history-merger" +version = "0.1.0" +description = "Simple tool to merge browser histories into a single database" +authors = [ + { name = "qwjyh", email = "urataw421@gmail.com" } +] +dependencies = [] +readme = "README.md" +license = { text = "MIT License" } +requires-python = ">= 3.8" + +[project.scripts] +"browser-history-merger" = "browser_history_merger:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.rye] +managed = true +dev-dependencies = [] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/browser_history_merger"] diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 0000000..8f23096 --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,10 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false + +-e file:. diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 0000000..8f23096 --- /dev/null +++ b/requirements.lock @@ -0,0 +1,10 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false + +-e file:. diff --git a/src/browser_history_merger/__init__.py b/src/browser_history_merger/__init__.py new file mode 100644 index 0000000..417052b --- /dev/null +++ b/src/browser_history_merger/__init__.py @@ -0,0 +1,371 @@ +#!/usr/bin/env python +import argparse +import logging +import socket +import sqlite3 +from typing import Literal + + +def init_db( + root_con: sqlite3.Connection, root_cur: sqlite3.Cursor, args: argparse.Namespace +): + print("Initialize db") + logging.info("Initializing db") + + # Create db + res = root_cur.execute( + """ + SELECT + * + FROM + sqlite_master + WHERE + type = 'table' AND name='browsers' + """ + ) + + if res.fetchone() is None: + print("Creating root db") + root_cur.execute( + """ + CREATE TABLE browsers ( + id INTEGER PRIMARY KEY, + name LONGVARCHAR NOT NULL UNIQUE, + hostname LONGVARCHAR, + visits_time_max INTEGER NOT NULL, + database_path LONGVARCHAR NOT NULL + ) + """ + ) + root_cur.execute( + """ + CREATE TABLE urls ( + id INTEGER, + browser INTEGER NOT NULL, + original_id INTEGER, + url LONGVARCHAR, + title LONGVARCHAR, + PRIMARY KEY("id" AUTOINCREMENT), + FOREIGN KEY("browser") REFERENCES "browsers"("id") + ) + """ + ) + # `visits` table + # - id: visits id + # - browser: + # - url: + # - title: urls.title at the time when the `add` is executed + # - visit_time: usec with chromium offset + root_cur.execute( + """ + CREATE TABLE visits ( + id INTEGER, + browser INTEGER NOT NULL, + original_id INTEGER, + url_id INTEGER NOT NULL, + url LONGVARCHAR NOT NULL, + title LONGVARCHAR, + visit_time INTEGER NOT NULL, + from_visit INTEGER, + transition_qualifier INTEGER DEFAULT 0, + transition_type INTEGER, + PRIMARY KEY("id" AUTOINCREMENT), + FOREIGN KEY("browser") REFERENCES "browsers"("id") + FOREIGN KEY("transition_type") REFERENCES "transition_type"("id") + ) + """ + ) + # `transition_type` + root_cur.execute( + """ + CREATE TABLE transition_type ( + id INTEGER NOT NULL, + name LONGVARCHAR, + PRIMARY KEY("id") + ) + """ + ) + visit_types = [ + (1, "link"), + (2, "typed"), + (3, "auto_bookmark"), + (4, "auto_subframe"), + (5, "manual_subframe"), + (6, "generated"), + (7, "auto_toplevel"), + (8, "form_submit"), + (9, "reload"), + (10, "keyword"), + (11, "keyword_generated"), + (12, "redirect_permanent"), + (13, "redirect_temporary"), + (14, "download"), + (0, "unknown"), + ] + root_cur.executemany( + """ + INSERT INTO transition_type VALUES(?, ?) + """, + visit_types, + ) + root_con.commit() + + res = root_cur.execute( + """ + SELECT + browsers.name + FROM + browsers + WHERE + browsers.name = (?) + """, + [args.name] + ) + if res.fetchone() is not None: + print(f"The name {args.name} is already used") + raise ValueError("The provided name for the browser is already used") + root_cur.execute( + """ + INSERT INTO browsers VALUES(NULL, ?, ?, 0, ?) + """, + [args.name, socket.gethostname(), args.database], + ) + root_con.commit() + + +def get_db_type(cur: sqlite3.Cursor) -> Literal["firefox", "chromium"]: + res = cur.execute( + """ + SELECT + * + FROM + sqlite_master + WHERE + type='table' AND name='urls' + """ + ) + db_type = "firefox" if res.fetchone() is None else "chromium" + return db_type + + +def get_browser_info(root_cur: sqlite3.Cursor, name: str) -> tuple[int, int, str]: + res = root_cur.execute( + """ + SELECT + id, + visits_time_max, + database_path + FROM + browsers + WHERE + browsers.name = (?) + """, + (name,), + ) + browser_id, visits_time_max, database_path = res.fetchone() + return (browser_id, visits_time_max, database_path) + + +def convert_chromium_transition_type(transition_qualifier: int) -> int: + """ + Convert transition qualifier of chromium to transition type id defined in doc. + """ + match transition_qualifier % 0x100: + case x if 0 <= x <= 10: + return x + 1 + case _: + return 0 # unknown + + +def convert_firefox_transition_type(transition_type: int) -> int: + """ + Convert `visit_type` of chromium to transition type id defined in doc. + """ + match transition_type: + case x if 1 <= x <= 4: + return x + case 8: + return 5 + case 9: + return 9 + case 5: + return 12 + case 6: + return 13 + case 7: + return 14 + case _: + return 0 + + +def add_db( + root_con: sqlite3.Connection, root_cur: sqlite3.Cursor, args: argparse.Namespace +): + print("Add history to root db") + browser_id, visits_time_max, database_path = get_browser_info(root_cur, args.name) + logging.info(f"{browser_id=}, {visits_time_max=}") + + logging.info(f"Source: {database_path}") + logging.info(f"Root: {args.root_db}") + + dburi = f"file:{database_path}?mode=ro&nolock=1" + logging.info(f"DB uri: {dburi}") + con = sqlite3.connect(dburi, uri=True) + cur = con.cursor() + + db_type = get_db_type(cur) + logging.info(f"DB type: {db_type}") + + match db_type: + case "firefox": + logging.error("Not implemented") + raise RuntimeError("Not implemented") + case "chromium": + select_url_toupdate_sql = """ + SELECT + urls.id, + urls.url, + urls.title + FROM + visits, + urls + WHERE + visits.visit_time > (?) + AND visits.url = urls.id + """ + select_visit_sql = """ + SELECT + visits.id, + visits.url, + urls.url, + urls.title, + visits.visit_time, + visits.from_visit, + visits.transition + FROM + visits, + urls + WHERE + visits.visit_time > (?) + AND visits.url = urls.id + """ + convert_transition_type = convert_chromium_transition_type + convert_transition_qualifier = lambda x: x + res = cur.execute(select_url_toupdate_sql, [visits_time_max]) + updating_urls = ( + ( + browser_id, + id, + url, + title, + ) + for id, url, title in res + ) + root_cur.executemany( + """ + REPLACE INTO urls + VALUES(NULL, ?, ?, ?, ?) + """, + updating_urls, + ) + print(f"Wrote {root_cur.rowcount} urls") + root_con.commit() + print(f"Wrote {root_cur.rowcount} urls") + logging.info("updated urls in new visits") + res = cur.execute(select_visit_sql, [visits_time_max]) + new_visits = ( + ( + browser_id, + id, + url_id, + url, + title, + visit_time, + from_visit, + convert_transition_qualifier(transition), + convert_transition_type(transition), + ) + for id, url_id, url, title, visit_time, from_visit, transition in res + ) + root_cur.executemany( + """ + INSERT INTO visits + VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + new_visits, + ) + print(f"Wrote {root_cur.rowcount} visits") + root_con.commit() + print(f"Wrote {root_cur.rowcount} visits") + logging.info("added new visits") + + # update visits_time_max + res = root_cur.execute( + """ + SELECT + max(visits.visit_time) + FROM + visits + WHERE + visits.browser = (?) + """, + [browser_id], + ) + (new_urls_time_max,) = res.fetchone() + logging.info(f"{new_urls_time_max=}") + root_cur.execute( + """ + UPDATE + browsers + SET + visits_time_max = (?) + WHERE + browsers.id = (?) + """, + (new_urls_time_max, browser_id), + ) + root_con.commit() + logging.info("Updated browser information") + + +def main() -> int: + parser = argparse.ArgumentParser(description="Browser history merger") + parser.add_argument("root_db", help="Merged database path") + parser.add_argument( + "-v", "--verbosity", action="count", default=0, help="Increase log verbosity" + ) + subparsers = parser.add_subparsers() + parser_init = subparsers.add_parser("init", help="Initialize root db") + parser_init.add_argument("name", help="Unique name for the browser") + parser_init.add_argument("database", help="Path to the browser's history db") + parser_init.set_defaults(func=init_db) + parse_add = subparsers.add_parser("add", help="Add history to root db") + # parse_add.add_argument("db", help="Source db file") + parse_add.add_argument( + "name", help="Source browser name(which was added to root db before)" + ) + parse_add.set_defaults(func=add_db) + args = parser.parse_args() + + match args.verbosity: + case 0: + logging.basicConfig(level=logging.WARN) + case 1: + logging.basicConfig(level=logging.INFO) + case _: + logging.basicConfig(level=logging.DEBUG) + logging.debug(f"{args=}") + + root_db_path = args.root_db + root_con = sqlite3.connect(root_db_path) + root_cur = root_con.cursor() + + if not hasattr(args, "func"): + parser.print_help() + return 1 + args.func(root_con, root_cur, args) + return 0 + + +if __name__ == "__main__": + main() diff --git a/src/browser_history_merger/__main__.py b/src/browser_history_merger/__main__.py new file mode 100644 index 0000000..062bae7 --- /dev/null +++ b/src/browser_history_merger/__main__.py @@ -0,0 +1,4 @@ +import browser_history_merger +import sys + +sys.exit(browser_history_merger.main()) \ No newline at end of file