add reflink-based dedup; closes #201

This commit is contained in:
ed 2025-07-28 19:41:03 +00:00
parent 674fc1fe08
commit df9feabcf8
6 changed files with 25 additions and 3 deletions

View file

@ -1439,12 +1439,17 @@ if you enable deduplication with `--dedup` then it'll create a symlink instead o
**warning:** when enabling dedup, you should also: **warning:** when enabling dedup, you should also:
* enable indexing with `-e2dsa` or volflag `e2dsa` (see [file indexing](#file-indexing) section below); strongly recommended * enable indexing with `-e2dsa` or volflag `e2dsa` (see [file indexing](#file-indexing) section below); strongly recommended
* ...and/or `--hardlink-only` to use hardlink-based deduplication instead of symlinks; see explanation below * ...and/or `--hardlink-only` to use hardlink-based deduplication instead of symlinks; see explanation below
* ...and/or `--reflink` to use CoW/reflink-based dedup (much safer than hardlink, but OS/FS-dependent)
it will not be safe to rename/delete files if you only enable dedup and none of the above; if you enable indexing then it is not *necessary* to also do hardlinks (but you may still want to) it will not be safe to rename/delete files if you only enable dedup and none of the above; if you enable indexing then it is not *necessary* to also do hardlinks (but you may still want to)
by default, deduplication is done based on symlinks (symbolic links); these are tiny files which are pointers to the nearest full copy of the file by default, deduplication is done based on symlinks (symbolic links); these are tiny files which are pointers to the nearest full copy of the file
you can choose to use hardlinks instead of softlinks, globally with `--hardlink-only` or volflag `hardlinkonly`; you can choose to use hardlinks instead of softlinks, globally with `--hardlink-only` or volflag `hardlinkonly`, and you can choose to use reflinks with `--reflink` or volflag `reflink`
advantages of using reflinks (CoW, copy-on-write):
* entirely safe (when your filesystem supports it correctly); either file can be edited or deleted without affecting other copies
* only linux 5.3 or newer, only python 3.14 or newer, only some filesystems (btrfs probably ok, maybe xfs too, but zfs had bugs)
advantages of using hardlinks: advantages of using hardlinks:
* hardlinks are more compatible with other software; they behave entirely like regular files * hardlinks are more compatible with other software; they behave entirely like regular files

View file

@ -1056,6 +1056,7 @@ def add_upload(ap):
ap2.add_argument("--safe-dedup", metavar="N", type=int, default=50, help="how careful to be when deduplicating files; [\033[32m1\033[0m] = just verify the filesize, [\033[32m50\033[0m] = verify file contents have not been altered (volflag=safededup)") ap2.add_argument("--safe-dedup", metavar="N", type=int, default=50, help="how careful to be when deduplicating files; [\033[32m1\033[0m] = just verify the filesize, [\033[32m50\033[0m] = verify file contents have not been altered (volflag=safededup)")
ap2.add_argument("--hardlink", action="store_true", help="enable hardlink-based dedup; will fallback on symlinks when that is impossible (across filesystems) (volflag=hardlink)") ap2.add_argument("--hardlink", action="store_true", help="enable hardlink-based dedup; will fallback on symlinks when that is impossible (across filesystems) (volflag=hardlink)")
ap2.add_argument("--hardlink-only", action="store_true", help="do not fallback to symlinks when a hardlink cannot be made (volflag=hardlinkonly)") ap2.add_argument("--hardlink-only", action="store_true", help="do not fallback to symlinks when a hardlink cannot be made (volflag=hardlinkonly)")
ap2.add_argument("--reflink", action="store_true", help="enable reflink-based dedup; will fallback on full copies when that is impossible (non-CoW filesystem) (volflag=reflink)")
ap2.add_argument("--no-dupe", action="store_true", help="reject duplicate files during upload; only matches within the same volume (volflag=nodupe)") ap2.add_argument("--no-dupe", action="store_true", help="reject duplicate files during upload; only matches within the same volume (volflag=nodupe)")
ap2.add_argument("--no-clone", action="store_true", help="do not use existing data on disk to satisfy dupe uploads; reduces server HDD reads in exchange for much more network load (volflag=noclone)") ap2.add_argument("--no-clone", action="store_true", help="do not use existing data on disk to satisfy dupe uploads; reduces server HDD reads in exchange for much more network load (volflag=noclone)")
ap2.add_argument("--no-snap", action="store_true", help="disable snapshots -- forget unfinished uploads on shutdown; don't create .hist/up2k.snap files -- abandoned/interrupted uploads must be cleaned up manually") ap2.add_argument("--no-snap", action="store_true", help="disable snapshots -- forget unfinished uploads on shutdown; don't create .hist/up2k.snap files -- abandoned/interrupted uploads must be cleaned up manually")

View file

@ -2124,6 +2124,7 @@ class AuthSrv(object):
all_mte = {} all_mte = {}
errors = False errors = False
free_umask = False free_umask = False
have_reflink = False
for vol in vfs.all_nodes.values(): for vol in vfs.all_nodes.values():
if (self.args.e2ds and vol.axs.uwrite) or self.args.e2dsa: if (self.args.e2ds and vol.axs.uwrite) or self.args.e2dsa:
vol.flags["e2ds"] = True vol.flags["e2ds"] = True
@ -2207,6 +2208,9 @@ class AuthSrv(object):
if "unlistcr" in vol.flags or "unlistcw" in vol.flags: if "unlistcr" in vol.flags or "unlistcw" in vol.flags:
self.args.have_unlistc = True self.args.have_unlistc = True
if "reflink" in vol.flags:
have_reflink = True
zs = str(vol.flags.get("tcolor", "")).lstrip("#") zs = str(vol.flags.get("tcolor", "")).lstrip("#")
if len(zs) == 3: # fc5 => ffcc55 if len(zs) == 3: # fc5 => ffcc55
vol.flags["tcolor"] = "".join([x * 2 for x in zs]) vol.flags["tcolor"] = "".join([x * 2 for x in zs])
@ -2571,6 +2575,13 @@ class AuthSrv(object):
t = "WARNING! The following IdP volumes are mounted below another volume where other users can read and/or write files. This is a SECURITY HAZARD!! When copyparty is restarted, it will not know about these IdP volumes yet. These volumes will then be accessible by an unexpected set of permissions UNTIL one of the users associated with their volume sends a request to the server. RECOMMENDATION: You should create a restricted volume where nobody can read/write files, and make sure that all IdP volumes are configured to appear somewhere below that volume." t = "WARNING! The following IdP volumes are mounted below another volume where other users can read and/or write files. This is a SECURITY HAZARD!! When copyparty is restarted, it will not know about these IdP volumes yet. These volumes will then be accessible by an unexpected set of permissions UNTIL one of the users associated with their volume sends a request to the server. RECOMMENDATION: You should create a restricted volume where nobody can read/write files, and make sure that all IdP volumes are configured to appear somewhere below that volume."
self.log(t + "".join(self.idp_err), 1) self.log(t + "".join(self.idp_err), 1)
if have_reflink:
t = "WARNING: Reflink-based dedup was requested, but %s. This will not work; files will be full copies instead."
if sys.version_info < (3, 14):
self.log(t % "your python version is not new enough", 1)
if not sys.platform.startswith("linux"):
self.log(t % "your OS is not Linux", 1)
self.vfs = vfs self.vfs = vfs
self.acct = acct self.acct = acct
self.defpw = defpw self.defpw = defpw

View file

@ -52,6 +52,7 @@ def vf_bmap() -> dict[str, str]:
"og_no_head", "og_no_head",
"og_s_title", "og_s_title",
"rand", "rand",
"reflink",
"rmagic", "rmagic",
"rss", "rss",
"wo_up_readme", "wo_up_readme",
@ -168,6 +169,7 @@ flagcats = {
"dedup": "enable symlink-based file deduplication", "dedup": "enable symlink-based file deduplication",
"hardlink": "enable hardlink-based file deduplication,\nwith fallback on symlinks when that is impossible", "hardlink": "enable hardlink-based file deduplication,\nwith fallback on symlinks when that is impossible",
"hardlinkonly": "dedup with hardlink only, never symlink;\nmake a full copy if hardlink is impossible", "hardlinkonly": "dedup with hardlink only, never symlink;\nmake a full copy if hardlink is impossible",
"reflink": "enable reflink-based file deduplication,\nwith fallback on full copy when that is impossible",
"safededup": "verify on-disk data before using it for dedup", "safededup": "verify on-disk data before using it for dedup",
"noclone": "take dupe data from clients, even if available on HDD", "noclone": "take dupe data from clients, even if available on HDD",
"nodupe": "rejects existing files (instead of linking/cloning them)", "nodupe": "rejects existing files (instead of linking/cloning them)",

View file

@ -3476,6 +3476,8 @@ class Up2k(object):
linked = False linked = False
try: try:
if "reflink" in flags:
raise Exception("reflink")
if not is_mv and not flags.get("dedup"): if not is_mv and not flags.get("dedup"):
raise Exception("dedup is disabled in config") raise Exception("dedup is disabled in config")
@ -3532,6 +3534,7 @@ class Up2k(object):
linked = True linked = True
except Exception as ex: except Exception as ex:
if str(ex) != "reflink":
self.log("cannot link; creating copy: " + repr(ex)) self.log("cannot link; creating copy: " + repr(ex))
if bos.path.isfile(src): if bos.path.isfile(src):
csrc = src csrc = src

View file

@ -143,7 +143,7 @@ class Cfg(Namespace):
def __init__(self, a=None, v=None, c=None, **ka0): def __init__(self, a=None, v=None, c=None, **ka0):
ka = {} ka = {}
ex = "chpw cookie_lax daw dav_auth dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink hardlink_only ih ihead magic nid nih no_acode no_athumb no_bauth no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tail no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rmagic rss smb srch_dbg srch_excl stats uqe vague_403 vc ver wo_up_readme write_uplog xdev xlink xvol zipmaxu zs" ex = "chpw cookie_lax daw dav_auth dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink hardlink_only ih ihead magic nid nih no_acode no_athumb no_bauth no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tail no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz reflink rmagic rss smb srch_dbg srch_excl stats uqe vague_403 vc ver wo_up_readme write_uplog xdev xlink xvol zipmaxu zs"
ka.update(**{k: False for k in ex.split()}) ka.update(**{k: False for k in ex.split()})
ex = "dav_inf dedup dotpart dotsrch hook_v no_dhash no_fastboot no_fpool no_htp no_rescan no_sendfile no_ses no_snap no_up_list no_voldump re_dhash see_dots plain_ip" ex = "dav_inf dedup dotpart dotsrch hook_v no_dhash no_fastboot no_fpool no_htp no_rescan no_sendfile no_ses no_snap no_up_list no_voldump re_dhash see_dots plain_ip"