about summary refs log tree commit diff stats
diff options
author Darrick J. Wong <djwong@kernel.org> 2024-11-26 16:24:24 -0800
committer Darrick J. Wong <djwong@kernel.org> 2024-11-26 16:24:24 -0800
commit f512c164a1d0e83a54eabf757be689c3f8bc54a7 (patch)
tree 8c9e7d9085cee2f59ef129e259d5ad9588362c5d
parent 14bbbf56779842217982894a4499f6cb212f655f (diff)
parent 368784fa00f920518ac686638c163852a477937c (diff)
download xfs-documentation-for-next.tar.gz
Merge tag 'xfsdocs-6.13-updates_2024-11-26' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-documentation [tag: v2024-11-26] [branch: for-next]
xfs-documentation: updates for 6.13 [1/3]

Here's a pile of updates detailing the changes made during 6.12 and 6.13.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
-rw-r--r-- .gitignore 1
-rw-r--r-- admin/Makefile 13
-rw-r--r-- admin/XFS_Performance_Tuning/Makefile 13
-rw-r--r-- admin/XFS_Performance_Tuning/filesystem_tunables.asciidoc 6
-rw-r--r-- admin/XFS_Performance_Tuning/xfs_performance_tuning.asciidoc 4
-rw-r--r-- design/Makefile 13
-rw-r--r-- design/XFS_Filesystem_Structure/Makefile 13
-rw-r--r-- design/XFS_Filesystem_Structure/allocation_groups.asciidoc 546
-rw-r--r-- design/XFS_Filesystem_Structure/common_types.asciidoc 4
-rw-r--r-- design/XFS_Filesystem_Structure/data_extents.asciidoc 14
-rw-r--r-- design/XFS_Filesystem_Structure/delayed_logging.asciidoc (renamed from design/xfs-delayed-logging-design.asciidoc) 4
-rw-r--r-- design/XFS_Filesystem_Structure/docinfo.xml 80
-rw-r--r-- design/XFS_Filesystem_Structure/extended_attributes.asciidoc 156
-rw-r--r-- design/XFS_Filesystem_Structure/fs_properties.asciidoc 28
-rw-r--r-- design/XFS_Filesystem_Structure/images/31.png bin 10652 -> 0 bytes
-rw-r--r-- design/XFS_Filesystem_Structure/internal_inodes.asciidoc 171
-rw-r--r-- design/XFS_Filesystem_Structure/journaling_log.asciidoc 278
-rw-r--r-- design/XFS_Filesystem_Structure/magic.asciidoc 9
-rw-r--r-- design/XFS_Filesystem_Structure/metadata_integrity.asciidoc 36
-rw-r--r-- design/XFS_Filesystem_Structure/metadump.asciidoc 122
-rw-r--r-- design/XFS_Filesystem_Structure/ondisk_inode.asciidoc 92
-rwxr-xr-x[-rw-r--r--] design/XFS_Filesystem_Structure/overview.asciidoc 2
-rw-r--r-- design/XFS_Filesystem_Structure/realtime.asciidoc 394
-rw-r--r-- design/XFS_Filesystem_Structure/reconstruction.asciidoc 17
-rw-r--r-- design/XFS_Filesystem_Structure/refcountbt.asciidoc 2
-rw-r--r-- design/XFS_Filesystem_Structure/self_describing_metadata.asciidoc (renamed from design/xfs-self-describing-metadata.asciidoc) 29
-rw-r--r-- design/XFS_Filesystem_Structure/superblock.asciidoc 574
-rw-r--r-- design/XFS_Filesystem_Structure/timestamps.asciidoc 65
-rw-r--r-- design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc 14
-rw-r--r-- design/xfs-smr-structure.asciidoc 14
30 files changed, 2056 insertions, 658 deletions
diff --git a/.gitignore b/.gitignore
index a2e10b4..412ff1c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
*.html
*.pdf
*.css
+*.epub
diff --git a/admin/Makefile b/admin/Makefile
index de27f3b..dcffc63 100644
--- a/admin/Makefile
+++ b/admin/Makefile
@@ -11,6 +11,7 @@ DOCFILES=$(wildcard *.asciidoc)
HTML_TARGETS=$(addsuffix .html, $(basename $(DOCFILES)))
PDF_TARGETS=$(addsuffix .pdf, $(basename $(DOCFILES)))
+EPUB_TARGETS=$(addsuffix .epub, $(basename $(DOCFILES)))
%.html: %.asciidoc
@echo "[html] $*"
@@ -20,7 +21,11 @@ PDF_TARGETS=$(addsuffix .pdf, $(basename $(DOCFILES)))
@echo "[pdf] $*"
$(Q)a2x -f pdf $<
-default: html pdf $(SUBDIRS)
+%.epub: %.asciidoc
+ @echo "[epub] $*"
+ $(Q)a2x -f epub $<
+
+default: html pdf epub $(SUBDIRS)
$(SUBDIRS):
@echo "Building $@"
@@ -30,14 +35,18 @@ html: $(HTML_TARGETS)
pdf: $(PDF_TARGETS)
+epub: $(EPUB_TARGETS)
+
# manually construct build dependencies for target builds so that modification
# of individual files will trigger a rebuild of the document correctly.
$(PDF_TARGETS): $(DOCFILES)
$(HTML_TARGETS): $(DOCFILES)
+$(EPUB_TARGETS): $(DOCFILES)
+
clean: $(addsuffix -clean, $(SUBDIRS))
- $(Q)rm -f *.html *.pdf *.css
+ $(Q)rm -f *.html *.pdf *.css *.epub
%-clean:
@echo "Cleaning $*"
diff --git a/admin/XFS_Performance_Tuning/Makefile b/admin/XFS_Performance_Tuning/Makefile
index 06451f1..2b929a4 100644
--- a/admin/XFS_Performance_Tuning/Makefile
+++ b/admin/XFS_Performance_Tuning/Makefile
@@ -8,8 +8,9 @@ DOCFILES=$(wildcard *.asciidoc) \
HTML_TARGET=$(addsuffix .html, $(TARGET))
PDF_TARGET=$(addsuffix .pdf, $(TARGET))
+EPUB_TARGET=$(addsuffix .epub, $(TARGET))
-default: html pdf
+default: html pdf epub
%.html: %.asciidoc
@echo "[html] $*"
@@ -19,16 +20,24 @@ default: html pdf
@echo "[pdf] $*"
$(Q)a2x -f pdf -d book $<
+%.epub: %.asciidoc
+ @echo "[epub] $*"
+ $(Q)a2x -f epub -d book $<
+
html: $(HTML_TARGET)
pdf: $(PDF_TARGET)
+epub: $(EPUB_TARGET)
+
# manually construct build dependencies for target builds so that modification
# of individual files will trigger a rebuild of the document correctly.
$(PDF_TARGET): $(DOCFILES)
$(HTML_TARGET): $(DOCFILES)
+$(EPUB_TARGET): $(DOCFILES)
+
clean:
- $(Q)rm -f *.html *.pdf *.css
+ $(Q)rm -f *.html *.pdf *.css *.epub
diff --git a/admin/XFS_Performance_Tuning/filesystem_tunables.asciidoc b/admin/XFS_Performance_Tuning/filesystem_tunables.asciidoc
index c12981b..c570406 100644
--- a/admin/XFS_Performance_Tuning/filesystem_tunables.asciidoc
+++ b/admin/XFS_Performance_Tuning/filesystem_tunables.asciidoc
@@ -35,7 +35,7 @@ units as used on the +mkfs.xfs+ command line to configure these parameters.
The performance examples given in this section are highly dependent on storage,
CPU and RAM configuration. They are intended as guidelines to illustrate
behavioural differences, not the exact performance any configuration will
-acheive.
+achieve.
=====
=== Directory block size
@@ -238,7 +238,7 @@ available for storing attributes.
When attributes are stored in the literal area of the inode, both attribute
names and attribute values are limited to a maximum size of 254 bytes. If either
name or value exceeds 254 bytes in length, or the total space used by the
-atributes exceeds the size of the literal area, the entire set of attributes
+attributes exceeds the size of the literal area, the entire set of attributes
stored on the inode are pushed to a separate attribute block instead of being
stored inline.
@@ -359,7 +359,7 @@ than the maximum, and hence there is no need to reduce the log buffer size for
fsync heavy workloads.
The default size of the log buffer is 32KB. The maximum size is 256KB and other
-supported sizes are 64KB, 128KB or power of 2 mulitples of the log stripe unit
+supported sizes are 64KB, 128KB or power of 2 multiples of the log stripe unit
between 32KB and 256KB. It can be configured by use of the +logbsize+ mount
option.
diff --git a/admin/XFS_Performance_Tuning/xfs_performance_tuning.asciidoc b/admin/XFS_Performance_Tuning/xfs_performance_tuning.asciidoc
index 0310bbd..b249e35 100644
--- a/admin/XFS_Performance_Tuning/xfs_performance_tuning.asciidoc
+++ b/admin/XFS_Performance_Tuning/xfs_performance_tuning.asciidoc
@@ -42,8 +42,8 @@ xref:Knowledge[Knowledge Section].
The xref:Process[Process section] will cover the typical processes used to
optimise a filesystem for a given workload. If the workload measurements are not
-accurate or reproducable, then no conclusions can be drawn as to whether a
-configuration changes an improvemnt or not. Hence without a robust testing
+accurate or reproducible, then no conclusions can be drawn as to whether a
+configuration changes an improvement or not. Hence without a robust testing
process, no amount of knowledge or observation will result in a well optimised
filesystem configuration.
diff --git a/design/Makefile b/design/Makefile
index 0879470..0847896 100644
--- a/design/Makefile
+++ b/design/Makefile
@@ -11,6 +11,7 @@ DOCFILES=$(wildcard *.asciidoc)
HTML_TARGETS=$(addsuffix .html, $(basename $(DOCFILES)))
PDF_TARGETS=$(addsuffix .pdf, $(basename $(DOCFILES)))
+EPUB_TARGETS=$(addsuffix .epub, $(basename $(DOCFILES)))
%.html: %.asciidoc
@echo "[html] $*"
@@ -20,7 +21,11 @@ PDF_TARGETS=$(addsuffix .pdf, $(basename $(DOCFILES)))
@echo "[pdf] $*"
$(Q)a2x -f pdf --dblatex-opts "-P latex.output.revhistory=0" $<
-default: html pdf $(SUBDIRS)
+%.epub: %.asciidoc
+ @echo "[epub] $*"
+ $(Q)a2x -f epub $<
+
+default: html pdf epub $(SUBDIRS)
$(SUBDIRS):
@echo "Building $@"
@@ -30,14 +35,18 @@ html: $(HTML_TARGETS)
pdf: $(PDF_TARGETS)
+epub: $(EPUB_TARGETS)
+
# manually construct build dependencies for target builds so that modification
# of individual files will trigger a rebuild of the document correctly.
$(PDF_TARGETS): $(DOCFILES)
$(HTML_TARGETS): $(DOCFILES)
+$(EPUB_TARGETS): $(DOCFILES)
+
clean: $(addsuffix -clean, $(SUBDIRS))
- $(Q)rm -f *.html *.pdf *.css
+ $(Q)rm -f *.html *.pdf *.css *.epub
%-clean:
@echo "Cleaning $*"
diff --git a/design/XFS_Filesystem_Structure/Makefile b/design/XFS_Filesystem_Structure/Makefile
index 359dd98..be78a75 100644
--- a/design/XFS_Filesystem_Structure/Makefile
+++ b/design/XFS_Filesystem_Structure/Makefile
@@ -8,8 +8,9 @@ DOCFILES=$(wildcard *.asciidoc) \
HTML_TARGET=$(addsuffix .html, $(TARGET))
PDF_TARGET=$(addsuffix .pdf, $(TARGET))
+EPUB_TARGET=$(addsuffix .epub, $(TARGET))
-default: html pdf
+default: html pdf epub
%.html: %.asciidoc
@echo "[html] $*"
@@ -19,16 +20,24 @@ default: html pdf
@echo "[pdf] $*"
$(Q)a2x -f pdf -d book $<
+%.epub: %.asciidoc
+ @echo "[epub] $*"
+ $(Q)a2x -f epub -d book $<
+
html: $(HTML_TARGET)
pdf: $(PDF_TARGET)
+epub: $(EPUB_TARGET)
+
# manually construct build dependencies for target builds so that modification
# of individual files will trigger a rebuild of the document correctly.
$(PDF_TARGET): $(DOCFILES)
$(HTML_TARGET): $(DOCFILES)
+$(EPUB_TARGET): $(DOCFILES)
+
clean:
- $(Q)rm -f *.html *.pdf *.css
+ $(Q)rm -f *.html *.pdf *.css *.epub
diff --git a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
index 992615d..c746a92 100644
--- a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
+++ b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
@@ -31,511 +31,7 @@ image::images/6.png[]
Each of these structures are expanded upon in the following sections.
-[[Superblocks]]
-== Superblocks
-
-Each AG starts with a superblock. The first one, in AG 0, is the primary
-superblock which stores aggregate AG information. Secondary superblocks are
-only used by xfs_repair when the primary superblock has been corrupted. A
-superblock is one sector in length.
-
-The superblock is defined by the following structure. The description of each
-field follows.
-
-[source, c]
-----
-struct xfs_sb
-{
- __uint32_t sb_magicnum;
- __uint32_t sb_blocksize;
- xfs_rfsblock_t sb_dblocks;
- xfs_rfsblock_t sb_rblocks;
- xfs_rtblock_t sb_rextents;
- uuid_t sb_uuid;
- xfs_fsblock_t sb_logstart;
- xfs_ino_t sb_rootino;
- xfs_ino_t sb_rbmino;
- xfs_ino_t sb_rsumino;
- xfs_agblock_t sb_rextsize;
- xfs_agblock_t sb_agblocks;
- xfs_agnumber_t sb_agcount;
- xfs_extlen_t sb_rbmblocks;
- xfs_extlen_t sb_logblocks;
- __uint16_t sb_versionnum;
- __uint16_t sb_sectsize;
- __uint16_t sb_inodesize;
- __uint16_t sb_inopblock;
- char sb_fname[12];
- __uint8_t sb_blocklog;
- __uint8_t sb_sectlog;
- __uint8_t sb_inodelog;
- __uint8_t sb_inopblog;
- __uint8_t sb_agblklog;
- __uint8_t sb_rextslog;
- __uint8_t sb_inprogress;
- __uint8_t sb_imax_pct;
- __uint64_t sb_icount;
- __uint64_t sb_ifree;
- __uint64_t sb_fdblocks;
- __uint64_t sb_frextents;
- xfs_ino_t sb_uquotino;
- xfs_ino_t sb_gquotino;
- __uint16_t sb_qflags;
- __uint8_t sb_flags;
- __uint8_t sb_shared_vn;
- xfs_extlen_t sb_inoalignmt;
- __uint32_t sb_unit;
- __uint32_t sb_width;
- __uint8_t sb_dirblklog;
- __uint8_t sb_logsectlog;
- __uint16_t sb_logsectsize;
- __uint32_t sb_logsunit;
- __uint32_t sb_features2;
- __uint32_t sb_bad_features2;
-
- /* version 5 superblock fields start here */
- __uint32_t sb_features_compat;
- __uint32_t sb_features_ro_compat;
- __uint32_t sb_features_incompat;
- __uint32_t sb_features_log_incompat;
-
- __uint32_t sb_crc;
- xfs_extlen_t sb_spino_align;
-
- xfs_ino_t sb_pquotino;
- xfs_lsn_t sb_lsn;
- uuid_t sb_meta_uuid;
- xfs_ino_t sb_rrmapino;
-};
-----
-*sb_magicnum*::
-Identifies the filesystem. Its value is +XFS_SB_MAGIC+ ``XFSB'' (0x58465342).
-
-*sb_blocksize*::
-The size of a basic unit of space allocation in bytes. Typically, this is 4096
-(4KB) but can range from 512 to 65536 bytes.
-
-*sb_dblocks*::
-Total number of blocks available for data and metadata on the filesystem.
-
-*sb_rblocks*::
-Number blocks in the real-time disk device. Refer to
-xref:Real-time_Devices[real-time sub-volumes] for more information.
-
-*sb_rextents*::
-Number of extents on the real-time device.
-
-*sb_uuid*::
-UUID (Universally Unique ID) for the filesystem. Filesystems can be mounted by
-the UUID instead of device name.
-
-*sb_logstart*::
-First block number for the journaling log if the log is internal (ie. not on a
-separate disk device). For an external log device, this will be zero (the log
-will also start on the first block on the log device). The identity of the log
-devices is not recorded in the filesystem, but the UUIDs of the filesystem and
-the log device are compared to prevent corruption.
-
-*sb_rootino*::
-Root inode number for the filesystem. Normally, the root inode is at the
-start of the first possible inode chunk in AG 0. This is 128 when using a 4KB
-block size.
-
-*sb_rbmino*::
-Bitmap inode for real-time extents.
-
-*sb_rsumino*::
-Summary inode for real-time bitmap.
-
-*sb_rextsize*::
-Realtime extent size in blocks.
-
-*sb_agblocks*::
-Size of each AG in blocks. For the actual size of the last AG, refer to the
-xref:AG_Free_Space_Management[free space] +agf_length+ value.
-
-*sb_agcount*::
-Number of AGs in the filesystem.
-
-*sb_rbmblocks*::
-Number of real-time bitmap blocks.
-
-*sb_logblocks*::
-Number of blocks for the journaling log.
-
-*sb_versionnum*::
-Filesystem version number. This is a bitmask specifying the features enabled
-when creating the filesystem. Any disk checking tools or drivers that do not
-recognize any set bits must not operate upon the filesystem. Most of the flags
-indicate features introduced over time. If the value of the lower nibble is >=
-4, the higher bits indicate feature flags as follows:
-
-.Version 4 Superblock version flags
-[options="header"]
-|=====
-| Flag | Description
-| +XFS_SB_VERSION_ATTRBIT+ |
-Set if any inode have extended attributes. If this bit is set; the
-+XFS_SB_VERSION2_ATTR2BIT+ is not set; and the +attr2+ mount flag is not
-specified, the +di_forkoff+ inode field will not be dynamically adjusted.
-See the section about xref:Extended_Attribute_Versions[extended attribute
-versions] for more information.
-
-| +XFS_SB_VERSION_NLINKBIT+ | Set if any inodes use 32-bit di_nlink values.
-| +XFS_SB_VERSION_QUOTABIT+ |
-Quotas are enabled on the filesystem. This
-also brings in the various quota fields in the superblock.
-
-| +XFS_SB_VERSION_ALIGNBIT+ | Set if sb_inoalignmt is used.
-| +XFS_SB_VERSION_DALIGNBIT+ | Set if sb_unit and sb_width are used.
-| +XFS_SB_VERSION_SHAREDBIT+ | Set if sb_shared_vn is used.
-| +XFS_SB_VERSION_LOGV2BIT+ | Version 2 journaling logs are used.
-| +XFS_SB_VERSION_SECTORBIT+ | Set if sb_sectsize is not 512.
-| +XFS_SB_VERSION_EXTFLGBIT+ | Unwritten extents are used. This is always set.
-| +XFS_SB_VERSION_DIRV2BIT+ |
-Version 2 directories are used. This is always set.
-
-| +XFS_SB_VERSION_MOREBITSBIT+ |
-Set if the sb_features2 field in the superblock contains more flags.
-|=====
-
-If the lower nibble of this value is 5, then this is a v5 filesystem; the
-+XFS_SB_VERSION2_CRCBIT+ feature must be set in +sb_features2+.
-
-*sb_sectsize*::
-Specifies the underlying disk sector size in bytes. Typically this is 512 or
-4096 bytes. This determines the minimum I/O alignment, especially for direct I/O.
-
-*sb_inodesize*::
-Size of the inode in bytes. The default is 256 (2 inodes per standard sector)
-but can be made as large as 2048 bytes when creating the filesystem. On a v5
-filesystem, the default and minimum inode size are both 512 bytes.
-
-*sb_inopblock*::
-Number of inodes per block. This is equivalent to +sb_blocksize / sb_inodesize+.
-
-*sb_fname[12]*::
-Name for the filesystem. This value can be used in the mount command.
-
-*sb_blocklog*::
-log~2~ value of +sb_blocksize+. In other terms, +sb_blocksize = 2^sb_blocklog^+.
-
-*sb_sectlog*::
-log~2~ value of +sb_sectsize+.
-
-*sb_inodelog*::
-log~2~ value of +sb_inodesize+.
-
-*sb_inopblog*::
-log~2~ value of +sb_inopblock+.
-
-*sb_agblklog*::
-log~2~ value of +sb_agblocks+ (rounded up). This value is used to generate inode
-numbers and absolute block numbers defined in extent maps.
-
-*sb_rextslog*::
-log~2~ value of +sb_rextents+.
-
-*sb_inprogress*::
-Flag specifying that the filesystem is being created.
-
-*sb_imax_pct*::
-Maximum percentage of filesystem space that can be used for inodes. The default
-value is 5%.
-
-*sb_icount*::
-Global count for number inodes allocated on the filesystem. This is only
-maintained in the first superblock.
-
-*sb_ifree*::
-Global count of free inodes on the filesystem. This is only maintained in the
-first superblock.
-
-*sb_fdblocks*::
-Global count of free data blocks on the filesystem. This is only maintained in
-the first superblock.
-
-*sb_frextents*::
-Global count of free real-time extents on the filesystem. This is only
-maintained in the first superblock.
-
-*sb_uquotino*::
-Inode for user quotas. This and the following two quota fields only apply if
-+XFS_SB_VERSION_QUOTABIT+ flag is set in +sb_versionnum+. Refer to
-xref:Quota_Inodes[quota inodes] for more information
-
-*sb_gquotino*::
-Inode for group or project quotas. Group and Project quotas cannot be used at
-the same time.
-
-*sb_qflags*::
-Quota flags. It can be a combination of the following flags:
-
-.Superblock quota flags
-[options="header"]
-|=====
-| Flag | Description
-| +XFS_UQUOTA_ACCT+ | User quota accounting is enabled.
-| +XFS_UQUOTA_ENFD+ | User quotas are enforced.
-| +XFS_UQUOTA_CHKD+ | User quotas have been checked.
-| +XFS_PQUOTA_ACCT+ | Project quota accounting is enabled.
-| +XFS_OQUOTA_ENFD+ | Other (group/project) quotas are enforced.
-| +XFS_OQUOTA_CHKD+ | Other (group/project) quotas have been checked.
-| +XFS_GQUOTA_ACCT+ | Group quota accounting is enabled.
-| +XFS_GQUOTA_ENFD+ | Group quotas are enforced.
-| +XFS_GQUOTA_CHKD+ | Group quotas have been checked.
-| +XFS_PQUOTA_ENFD+ | Project quotas are enforced.
-| +XFS_PQUOTA_CHKD+ | Project quotas have been checked.
-|=====
-
-*sb_flags*::
-Miscellaneous flags.
-
-.Superblock flags
-[options="header"]
-|=====
-| Flag | Description
-| +XFS_SBF_READONLY+ | Only read-only mounts allowed.
-|=====
-
-*sb_shared_vn*::
-Reserved and must be zero (``vn'' stands for version number).
-
-*sb_inoalignmt*::
-Inode chunk alignment in fsblocks. Prior to v5, the default value provided for
-inode chunks to have an 8KiB alignment. Starting with v5, the default value
-scales with the multiple of the inode size over 256 bytes. Concretely, this
-means an alignment of 16KiB for 512-byte inodes, 32KiB for 1024-byte inodes,
-etc. If sparse inodes are enabled, the +ir_startino+ field of each inode
-B+tree record must be aligned to this block granularity, even if the inode
-given by +ir_startino+ itself is sparse.
-
-*sb_unit*::
-Underlying stripe or raid unit in blocks.
-
-*sb_width*::
-Underlying stripe or raid width in blocks.
-
-*sb_dirblklog*::
-log~2~ multiplier that determines the granularity of directory block allocations
-in fsblocks.
-
-*sb_logsectlog*::
-log~2~ value of the log subvolume's sector size. This is only used if the
-journaling log is on a separate disk device (i.e. not internal).
-
-*sb_logsectsize*::
-The log's sector size in bytes if the filesystem uses an external log device.
-
-*sb_logsunit*::
-The log device's stripe or raid unit size. This only applies to version 2 logs
-+XFS_SB_VERSION_LOGV2BIT+ is set in +sb_versionnum+.
-
-*sb_features2*::
-Additional version flags if +XFS_SB_VERSION_MOREBITSBIT+ is set in
-+sb_versionnum+. The currently defined additional features include:
-
-.Extended Version 4 Superblock flags
-[options="header"]
-|=====
-| Flag | Description
-| +XFS_SB_VERSION2_LAZYSBCOUNTBIT+ |
-Lazy global counters. Making a filesystem with this bit set can improve
-performance. The global free space and inode counts are only updated in the
-primary superblock when the filesystem is cleanly unmounted.
-
-| +XFS_SB_VERSION2_ATTR2BIT+ |
-Extended attributes version 2. Making a filesystem with this optimises the
-inode layout of extended attributes. If this bit is set and the +noattr2+
-mount flag is not specified, the +di_forkoff+ inode field will be dynamically
-adjusted. See the section about xref:Extended_Attribute_Versions[extended
-attribute versions] for more information.
-
-| +XFS_SB_VERSION2_PARENTBIT+ |
-Parent pointers. All inodes must have an extended attribute that points back to
-its parent inode. The primary purpose for this information is in backup systems.
-
-| +XFS_SB_VERSION2_PROJID32BIT+ |
-32-bit Project ID. Inodes can be associated with a project ID number, which
-can be used to enforce disk space usage quotas for a particular group of
-directories. This flag indicates that project IDs can be 32 bits in size.
-
-| +XFS_SB_VERSION2_CRCBIT+ |
-Metadata checksumming. All metadata blocks have an extended header containing
-the block checksum, a copy of the metadata UUID, the log sequence number of the
-last update to prevent stale replays, and a back pointer to the owner of the
-block. This feature must be and can only be set if the lowest nibble of
-+sb_versionnum+ is set to 5.
-
-| +XFS_SB_VERSION2_FTYPE+ |
-Directory file type. Each directory entry records the type of the inode to
-which the entry points. This speeds up directory iteration by removing the
-need to load every inode into memory.
-|=====
-
-*sb_bad_features2*::
-This field mirrors +sb_features2+, due to past 64-bit alignment errors.
-
-*sb_features_compat*::
-Read-write compatible feature flags. The kernel can still read and write this
-FS even if it doesn't understand the flag. Currently, there are no valid
-flags.
-
-*sb_features_ro_compat*::
-Read-only compatible feature flags. The kernel can still read this FS even if
-it doesn't understand the flag.
-
-.Extended Version 5 Superblock Read-Only compatibility flags
-[options="header"]
-|=====
-| Flag | Description
-| +XFS_SB_FEAT_RO_COMPAT_FINOBT+ |
-Free inode B+tree. Each allocation group contains a B+tree to track inode chunks
-containing free inodes. This is a performance optimization to reduce the time
-required to allocate inodes.
-
-| +XFS_SB_FEAT_RO_COMPAT_RMAPBT+ |
-Reverse mapping B+tree. Each allocation group contains a B+tree containing
-records mapping AG blocks to their owners. See the section about
-xref:Reconstruction[reconstruction] for more details.
-
-| +XFS_SB_FEAT_RO_COMPAT_REFLINK+ |
-Reference count B+tree. Each allocation group contains a B+tree to track the
-reference counts of AG blocks. This enables files to share data blocks safely.
-See the section about xref:Reflink_Deduplication[reflink and deduplication] for
-more details.
-
-|=====
-
-*sb_features_incompat*::
-Read-write incompatible feature flags. The kernel cannot read or write this
-FS if it doesn't understand the flag.
-
-.Extended Version 5 Superblock Read-Write incompatibility flags
-[options="header"]
-|=====
-| Flag | Description
-| +XFS_SB_FEAT_INCOMPAT_FTYPE+ |
-Directory file type. Each directory entry tracks the type of the inode to
-which the entry points. This is a performance optimization to remove the need
-to load every inode into memory to iterate a directory.
-
-| +XFS_SB_FEAT_INCOMPAT_SPINODES+ |
-Sparse inodes. This feature relaxes the requirement to allocate inodes in
-chunks of 64. When the free space is heavily fragmented, there might exist
-plenty of free space but not enough contiguous free space to allocate a new
-inode chunk. With this feature, the user can continue to create files until
-all free space is exhausted.
-
-Unused space in the inode B+tree records are used to track which parts of the
-inode chunk are not inodes.
-
-See the chapter on xref:Sparse_Inodes[Sparse Inodes] for more information.
-
-| +XFS_SB_FEAT_INCOMPAT_META_UUID+ |
-Metadata UUID. The UUID stamped into each metadata block must match the value
-in +sb_meta_uuid+. This enables the administrator to change +sb_uuid+ at will
-without having to rewrite the entire filesystem.
-|=====
-
-*sb_features_log_incompat*::
-Read-write incompatible feature flags for the log. The kernel cannot read or
-write this FS log if it doesn't understand the flag. Currently, no flags are
-defined.
-
-*sb_crc*::
-Superblock checksum.
-
-*sb_spino_align*::
-Sparse inode alignment, in fsblocks. Each chunk of inodes referenced by a
-sparse inode B+tree record must be aligned to this block granularity.
-
-*sb_pquotino*::
-Project quota inode.
-
-*sb_lsn*::
-Log sequence number of the last superblock update.
-
-*sb_meta_uuid*::
-If the +XFS_SB_FEAT_INCOMPAT_META_UUID+ feature is set, then the UUID field in
-all metadata blocks must match this UUID. If not, the block header UUID field
-must match +sb_uuid+.
-
-*sb_rrmapino*::
-If the +XFS_SB_FEAT_RO_COMPAT_RMAPBT+ feature is set and a real-time
-device is present (+sb_rblocks+ > 0), this field points to an inode
-that contains the root to the
-xref:Real_time_Reverse_Mapping_Btree[Real-Time Reverse Mapping B+tree].
-This field is zero otherwise.
-
-=== xfs_db Superblock Example
-
-A filesystem is made on a single disk with the following command:
-
-----
-# mkfs.xfs -i attr=2 -n size=16384 -f /dev/sda7
-meta-data=/dev/sda7 isize=256 agcount=16, agsize=3923122 blks
- = sectsz=512 attr=2
-data = bsize=4096 blocks=62769952, imaxpct=25
- = sunit=0 swidth=0 blks, unwritten=1
-naming =version 2 bsize=16384
-log =internal log bsize=4096 blocks=30649, version=1
- = sectsz=512 sunit=0 blks
-realtime =none extsz=65536 blocks=0, rtextents=0
-----
-
-And in xfs_db, inspecting the superblock:
-
-----
-xfs_db> sb
-xfs_db> p
-magicnum = 0x58465342
-blocksize = 4096
-dblocks = 62769952
-rblocks = 0
-rextents = 0
-uuid = 32b24036-6931-45b4-b68c-cd5e7d9a1ca5
-logstart = 33554436
-rootino = 128
-rbmino = 129
-rsumino = 130
-rextsize = 16
-agblocks = 3923122
-agcount = 16
-rbmblocks = 0
-logblocks = 30649
-versionnum = 0xb084
-sectsize = 512
-inodesize = 256
-inopblock = 16
-fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
-blocklog = 12
-sectlog = 9
-inodelog = 8
-inopblog = 4
-agblklog = 22
-rextslog = 0
-inprogress = 0
-imax_pct = 25
-icount = 64
-ifree = 61
-fdblocks = 62739235
-frextents = 0
-uquotino = 0
-gquotino = 0
-qflags = 0
-flags = 0
-shared_vn = 0
-inoalignmt = 2
-unit = 0
-width = 0
-dirblklog = 2
-logsectlog = 0
-logsectsize = 0
-logsunit = 0
-features2 = 8
-----
-
+include::superblock.asciidoc[]
[[AG_Free_Space_Management]]
== AG Free Space Management
@@ -928,6 +424,10 @@ struct xfs_agi {
__be32 agi_free_root;
__be32 agi_free_level;
+
+ __be32 agi_iblocks;
+ __be32 agi_fblocks;
+
}
----
*agi_magicnum*::
@@ -984,6 +484,16 @@ B+tree.
*agi_free_level*::
Specifies the number of levels in the free inode B+tree.
+*agi_iblocks*::
+The number of blocks in the inode B+tree, including the root.
+This field is zero if the +XFS_SB_FEAT_RO_COMPAT_INOBTCNT+ feature is not
+enabled.
+
+*agi_fblocks*::
+The number of blocks in the free inode B+tree, including the root.
+This field is zero if the +XFS_SB_FEAT_RO_COMPAT_INOBTCNT+ feature is not
+enabled.
+
[[Inode_Btrees]]
== Inode B+trees
@@ -1046,7 +556,7 @@ And a 2-level inode B+tree:
image::images/20b.png[]
-==== xfs_db AGI Example
+=== xfs_db AGI Example
This is an AGI of a freshly populated filesystem:
@@ -1099,7 +609,7 @@ recs[1-85] = [startino,freecount,free]
Most of the inode chunks on this filesystem are totally full, since the +free+
value is zero. This means that we ought to expect inode 160 to be linked
somewhere in the directory structure. However, notice that 0xff80000000000000
-in record 85 -- this means that we would expect inode 5856 to be free. Moving
+in record 85 -- this means that we would expect inode 5847 to be free. Moving
on to the free inode B+tree, we see that this is indeed the case:
----
@@ -1175,7 +685,7 @@ Number of free inodes in this chunk.
A 64 element bitmap showing which inodes in this chunk are not available for
allocation.
-==== xfs_db Sparse Inode AGI Example
+=== xfs_db Sparse Inode AGI Example
This example derives from an AG that has been deliberately fragmented. The
inode B+tree:
@@ -1262,23 +772,3 @@ core.magic = 0x494e
The chunk record also indicates that this chunk has 32 inodes, and that the
missing inodes are also ``free''.
-
-[[Real-time_Devices]]
-== Real-time Devices
-
-The performance of the standard XFS allocator varies depending on the internal
-state of the various metadata indices enabled on the filesystem. For
-applications which need to minimize the jitter of allocation latency, XFS
-supports the notion of a ``real-time device''. This is a special device
-separate from the regular filesystem where extent allocations are tracked with
-a bitmap and free space is indexed with a two-dimensional array. If an inode
-is flagged with +XFS_DIFLAG_REALTIME+, its data will live on the real time
-device. The metadata for real time devices is discussed in the section about
-xref:Real-time_Inodes[real time inodes].
-
-By placing the real time device (and the journal) on separate high-performance
-storage devices, it is possible to reduce most of the unpredictability in I/O
-response times that come from metadata operations.
-
-None of the XFS per-AG B+trees are involved with real time files. It is not
-possible for real time files to share data blocks.
diff --git a/design/XFS_Filesystem_Structure/common_types.asciidoc b/design/XFS_Filesystem_Structure/common_types.asciidoc
index 51909be..34cdfda 100644
--- a/design/XFS_Filesystem_Structure/common_types.asciidoc
+++ b/design/XFS_Filesystem_Structure/common_types.asciidoc
@@ -43,7 +43,9 @@ Unsigned 64 bit raw filesystem block number.
*xfs_rtblock_t*::
Unsigned 64 bit extent number in the xref:Real-time_Devices[real-time]
-sub-volume.
+sub-volume. If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, these
+values combine an xref:Realtime_Groups[rtgroup number] and block offset into
+the realtime group.
*xfs_fileoff_t*::
Unsigned 64 bit block offset into a file.
diff --git a/design/XFS_Filesystem_Structure/data_extents.asciidoc b/design/XFS_Filesystem_Structure/data_extents.asciidoc
index 4f1109b..29c78fe 100644
--- a/design/XFS_Filesystem_Structure/data_extents.asciidoc
+++ b/design/XFS_Filesystem_Structure/data_extents.asciidoc
@@ -17,7 +17,19 @@ vary depending on the extent allocator used in the XFS driver.
An extent is 128 bits in size and uses the following packed layout:
.Extent record format
-image::images/31.png[]
+
+[cols="1,1,1,1"]
+|===
+|bit[127]
+|bits[73-126]
+|bits[21-72]
+|bits[0-20]
+
+|flag
+|logical file block offset
+|absolute block number
+|# of blocks
+|===
The extent is represented by the +xfs_bmbt_rec+ structure which uses a big
endian format on-disk. In-core management of extents use the +xfs_bmbt_irec+
diff --git a/design/xfs-delayed-logging-design.asciidoc b/design/XFS_Filesystem_Structure/delayed_logging.asciidoc
index e54e786..e9a336f 100644
--- a/design/xfs-delayed-logging-design.asciidoc
+++ b/design/XFS_Filesystem_Structure/delayed_logging.asciidoc
@@ -1,6 +1,4 @@
-= XFS Delayed Logging Design
-Dave Chinner, <dchinner@redhat.com>
-v1.0, Feb 2014: Initial conversion to asciidoc
+= Delayed Logging
== Introduction to Re-logging in XFS
diff --git a/design/XFS_Filesystem_Structure/docinfo.xml b/design/XFS_Filesystem_Structure/docinfo.xml
index e13d705..3aadb66 100644
--- a/design/XFS_Filesystem_Structure/docinfo.xml
+++ b/design/XFS_Filesystem_Structure/docinfo.xml
@@ -169,4 +169,84 @@
</simplelist>
</revdescription>
</revision>
+ <revision>
+ <revnumber>3.141592</revnumber>
+ <date>May 2018</date>
+ <author>
+ <firstname>Darrick</firstname>
+ <surname>Wong</surname>
+ <email>darrick.wong@oracle.com</email>
+ </author>
+ <revdescription>
+ <simplelist>
+ <member>Incorporate Dave Chinner's log design document.</member>
+ <member>Incorporate Dave Chinner's self-describing metadata design document.</member>
+ </simplelist>
+ </revdescription>
+ </revision>
+ <revision>
+ <revnumber>3.1415926</revnumber>
+ <date>April 2021</date>
+ <author>
+ <firstname>Darrick</firstname>
+ <surname>Wong</surname>
+ <email>djwong@kernel.org</email>
+ </author>
+ <revdescription>
+ <simplelist>
+ <member>Document the needsrepair, bigtime, and inobtcount features.</member>
+ </simplelist>
+ </revdescription>
+ </revision>
+ <revision>
+ <revnumber>3.14159265</revnumber>
+ <date>February 2023</date>
+ <author>
+ <firstname>Darrick</firstname>
+ <surname>Wong</surname>
+ <email>djwong@kernel.org</email>
+ </author>
+ <revdescription>
+ <simplelist>
+ <member>Add epub output.</member>
+ <member>large extent counts</member>
+ <member>logged extended attribute updates</member>
+ </simplelist>
+ </revdescription>
+ </revision>
+ <revision>
+ <revnumber>3.141592653</revnumber>
+ <date>August 2024</date>
+ <author>
+ <firstname>Darrick</firstname>
+ <surname>Wong</surname>
+ <email>djwong@kernel.org</email>
+ </author>
+ <revdescription>
+ <simplelist>
+ <member>metadump v2</member>
+ <member>exchange range log items</member>
+ <member>parent pointers</member>
+ </simplelist>
+ </revdescription>
+ </revision>
+ <revision>
+ <revnumber>3.1415926535</revnumber>
+ <date>November 2024</date>
+ <author>
+ <firstname>Darrick</firstname>
+ <surname>Wong</surname>
+ <email>djwong@kernel.org</email>
+ </author>
+ <revdescription>
+ <simplelist>
+ <member>update online fsck docs</member>
+ <member>filesystem properties</member>
+ <member>metadata directory tree</member>
+ <member>realtime groups</member>
+ <member>metadir and quota </member>
+ <member>realtime sb metadump</member>
+ </simplelist>
+ </revdescription>
+ </revision>
</revhistory>
diff --git a/design/XFS_Filesystem_Structure/extended_attributes.asciidoc b/design/XFS_Filesystem_Structure/extended_attributes.asciidoc
index 7df2d3d..4000c00 100644
--- a/design/XFS_Filesystem_Structure/extended_attributes.asciidoc
+++ b/design/XFS_Filesystem_Structure/extended_attributes.asciidoc
@@ -90,6 +90,7 @@ A combination of the following:
| +XFS_ATTR_SECURE+ | The attribute's namespace is ``secure''.
| +XFS_ATTR_INCOMPLETE+ | This attribute is being modified.
| +XFS_ATTR_LOCAL+ | The attribute value is contained within this block.
+| +XFS_ATTR_PARENT+ | This attribute is a parent pointer.
|=====
.Short form attribute layout
@@ -460,7 +461,7 @@ size of these entries is determined dynamically.
A variable-length array of descriptors of remote attributes. The location and
size of these entries is determined dynamically.
-On a v5 filesystem, the header becomes +xfs_da3_blkinfo_t+ to accomodate the
+On a v5 filesystem, the header becomes +xfs_da3_blkinfo_t+ to accommodate the
extra metadata integrity fields:
[source, c]
@@ -910,3 +911,156 @@ Log sequence number of the last write to this block.
Filesystems formatted prior to v5 do not have this header in the remote block.
Value data begins immediately at offset zero.
+
+[[Parent_Pointers]]
+== Directory Parent Pointers
+
+If this feature is enabled, each directory entry pointing from a parent
+directory to a child file has a corresponding back link from the child file
+back to the parent. In other words, if directory P has an entry "foo" pointing
+to child C, then child C will have a parent pointer entry "foo" pointing to
+parent P. This redundancy enables validation and repairs of the directory tree
+if the tree structure is damaged.
+
+Parent pointers are stored in the private ATTR_PARENT namespace within the
+extended attribute structure. Attribute names in this namespace use a custom
+hash function, which is defined as the dirent name hash of the dirent name XORd
+with the upper and lower 32 bits of the parent inumber. This hash function
+reduces collisions if the same file is hard linked into multiple directories
+under identical names.
+
+The attribute name contains the dirent name in
+the parent, and the attribute value contains a file handle to the parent
+directory:
+
+[source, c]
+----
+struct xfs_parent_rec {
+ __be64 p_ino;
+ __be32 p_gen;
+};
+----
+
+*p_ino*::
+Inode number of the parent directory.
+
+*p_gen*::
+Generation number of the parent directory.
+
+=== xfs_db Parent Pointer Example
+
+Create a directory tree with the following structure, assuming that the
+XFS filesystem is mounted on +/mnt+:
+
+----
+$ mkdir /mnt/a/ /mnt/b
+$ touch /mnt/a/autoexec.bat
+$ ln /mnt/a/autoexec.bat /mnt/b/config.sys
+----
+
+Now we open this up in the debugger:
+
+----
+xfs_db> path /a
+xfs_db> ls
+8 131 directory 0x0000002e 1 . (good)
+10 128 directory 0x0000172e 2 .. (good)
+12 132 regular 0x5a1f6ea0 12 autoexec.bat (good)
+xfs_db> path /b
+xfs_db> ls
+8 16777344 directory 0x0000002e 1 . (good)
+10 128 directory 0x0000172e 2 .. (good)
+15 132 regular 0x9a01678c 10 config.sys (good)
+xfs_db> path /b/config.sys
+xfs_db> p a
+a.sfattr.hdr.totsize = 56
+a.sfattr.hdr.count = 2
+a.sfattr.list[0].namelen = 12
+a.sfattr.list[0].valuelen = 12
+a.sfattr.list[0].root = 0
+a.sfattr.list[0].secure = 0
+a.sfattr.list[0].parent = 1
+a.sfattr.list[0].name = "autoexec.bat"
+a.sfattr.list[0].parent_dir.inumber = 131
+a.sfattr.list[0].parent_dir.gen = 3204669414
+a.sfattr.list[1].namelen = 10
+a.sfattr.list[1].valuelen = 12
+a.sfattr.list[1].root = 0
+a.sfattr.list[1].secure = 0
+a.sfattr.list[1].parent = 1
+a.sfattr.list[1].name = "config.sys"
+a.sfattr.list[1].parent_dir.inumber = 16777344
+a.sfattr.list[1].parent_dir.gen = 4137450876
+
+----
+
+In this example, +/a+ and +/b+ are subdirectories of the root. A regular file
+is hardlinked into both subdirectories, under different names. Directory +/a+
+is inode 131 and has an entry +autoexec.bat+ pointing to the child file.
+Directory +/b+ is inode 16777344 and has an entry +config.sys+ pointing to the
+same child file.
+
+Within the child file, notice that there are two parent pointers in the
+extended attribute structure. The first parent pointer tells us that directory
+inode 131 should have an entry +autoexec.bat+ pointing down to the child; the
+second parent pointer tells us that directory inode 16777344 should have an
+entry +config.sys+ pointing down to the child.
+
+== Key Differences Between Directories and Extended Attributes
+
+Directories and extended attributes share the function of mapping names to
+information, but the differences in the functionality requirements applied to
+each type of structure influence their respective internal formats.
+Directories map variable length names to iterable directory entry records
+(dirent records), whereas extended attributes map variable length names to
+non-iterable attribute records. Both structures can take advantage of variable
+length record btree structures (i.e. the dabtree) to map name hashes, but there
+are major differences in the way each type of structure integrates the dabtree
+index within the information being stored. The directory dabtree leaf nodes
+contain mappings between a name hash and the location of a dirent record inside
+the directory entry segment. Extended attributes, on the other hand, store
+attribute records directly in the leaf nodes of the dabtree.
+
+When XFS adds or removes an attribute record in any dabtree, it splits or
+merges leaf nodes of the tree based on where the name hash index determines a
+record needs to be inserted into or removed. In the attribute dabtree, XFS
+splits or merges sparse leaf nodes of the dabtree as a side effect of inserting
+or removing attribute records.
+
+Directories, however, are subject to stricter constraints. The userspace
+readdir/seekdir/telldir directory cookie API places a requirement on the
+directory structure that a dirent record's cookie cannot change for the life of the
+dirent record. XFS uses the dirent record's logical offset into the directory
+data segment as the cookie, and hence the dirent record cannot change location.
+Therefore, XFS cannot store dirent records in the leaf nodes of the dabtree
+because the offset into the tree would change as other entries are inserted and
+removed.
+
+Dirent records are therefore stored within directory data blocks, all of which
+are mapped in the first directory segment. The directory dabtree is mapped
+into the second directory segment. Therefore, directory blocks require
+external free space tracking because they are not part of the dabtree itself.
+Because the dabtree only stores pointers to dirent records in the first data
+segment, there is no need to leave holes in the dabtree itself. The dabtree
+splits or merges leaf nodes as required as pointers to the directory data
+segment are added or removed, and needs no free space tracking.
+
+When XFS adds a dirent record, it needs to find the best-fitting free space in
+the directory data segment to turn into the new record. This requires a free
+space index for the directory data segment. The free space index is held in
+the third directory segment. Once XFS has used the free space index to find
+the block with the best free space, it modifies the directory data block and
+updates the dabtree to point the name hash at the new record. When XFS removes
+dirent records, it leaves a hole in the data segment so that the rest of the
+entries do not move, and removes the corresponding dabtree name hash mapping.
+
+Note that for small directories, XFS collapses the name hash mappings and
+the free space information into the directory data blocks to save space.
+
+In summary, the requirement for a free space map in the directory structure
+results from storing the dirent records externally to the dabtree. Attribute
+records are stored directly in the leaf nodes of the dabtree (except
+for remote attribute values which can be anywhere in the attr fork address
+space) and do not need external free space tracking to determine where to best
+insert them. As a result, extended attributes exhibit nearly perfect scaling
+until the computer runs out of memory.
diff --git a/design/XFS_Filesystem_Structure/fs_properties.asciidoc b/design/XFS_Filesystem_Structure/fs_properties.asciidoc
new file mode 100644
index 0000000..b639aec
--- /dev/null
+++ b/design/XFS_Filesystem_Structure/fs_properties.asciidoc
@@ -0,0 +1,28 @@
+[[Filesystem_Properties]]
+= Filesystem Properties
+
+System administrators can set filesystem-wide properties to coordinate the
+behavior of userspace XFS administration tools. These properties are recorded
+as extended attributes of the +ATTR_ROOT+ namespace that are set on the root
+directory.
+
+[options="header"]
+|=====
+| Property | Description
+| +xfs:autofsck+ | Online fsck background scanning behavior
+|=====
+
+*xfs:autofsck*::
+This property controls the behavior of background online fsck.
+Unrecognized values are treated as if the property was not set.
+Check the +xfs_scrub+ manual page for more information.
+
+.autofsck property values
+[options="header"]
+|=====
+| Value | Description
+| +none+ | Do not perform background scans.
+| +check+ | Only check metadata.
+| +optimize+ | Check and optimize metadata.
+| +repair+ | Check, repair, or optimize metadata.
+|=====
diff --git a/design/XFS_Filesystem_Structure/images/31.png b/design/XFS_Filesystem_Structure/images/31.png
deleted file mode 100644
index 48b0172..0000000
--- a/design/XFS_Filesystem_Structure/images/31.png
+++ /dev/null
Binary files differ
diff --git a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
index f5c2654..40eb572 100644
--- a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
+++ b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
@@ -5,12 +5,134 @@ XFS allocates several inodes when a filesystem is created. These are internal
and not accessible from the standard directory structure. These inodes are only
accessible from the superblock.
+[[Metadata_Directories]]
+== Metadata Directory Tree
+
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, the +sb_metadirino+
+field in the superblock points to the root of a directory tree containing
+metadata files. This directory tree is completely internal to the filesystem
+and must not be exposed to user programs.
+
+When this feature is enabled, metadata files should be found by walking the
+metadata directory tree. The superblock fields that formerly pointed to (some
+of) those inodes have been deallocated and may be reused by future features.
+
+.Metadata Directory Paths
+[options="header"]
+|=====
+| Metadata File | Location
+| xref:Quota_Inodes[User Quota] | /quota/user
+| xref:Quota_Inodes[Group Quota] | /quota/group
+| xref:Quota_Inodes[Project Quota] | /quota/project
+| xref:Real-Time_Bitmap_Inode[Realtime Bitmap] | /rtgroups/*.bitmap
+| xref:Real-Time_Summary_Inode[Realtime Summary] | /rtgroups/*.summary
+|=====
+
+Metadata files are flagged by the +XFS_DIFLAG2_METADATA+ flag in the
++di_flags2+ field. Metadata files must have the following properties:
+
+* Must be either a directory or a regular file.
+* chmod 0000
+* User and group IDs set to zero.
+* The +XFS_DIFLAG_IMMUTABLE+, +XFS_DIFLAG_SYNC+, +XFS_DIFLAG_NOATIME+, +XFS_DIFLAG_NODUMP+, and +XFS_DIFLAG_NODEFRAG+ flags must all be set in +di_flags+.
+* For a directory, the +XFS_DIFLAG_NOSYMLINKS+ flag must also be set.
+* The +XFS_DIFLAG2_METADATA+ flag must be set in +di_flags2+.
+* The +XFS_DIFLAG2_DAX+ flag must not be set.
+
+=== Metadata Directory Example
+
+This example shows a metadata directory from a freshly formatted root
+filesystem:
+
+----
+xfs_db> sb 0
+xfs_db> p
+magicnum = 0x58465342
+blocksize = 4096
+dblocks = 5192704
+rblocks = 0
+rextents = 0
+uuid = cbf2ceef-658e-46b0-8f96-785661c37976
+logstart = 4194311
+rootino = 128
+rbmino = 130
+rsumino = 131
+...
+meta_uuid = 00000000-0000-0000-0000-000000000000
+metadirino = 129
+...
+----
+
+Notice how the listing includes the root of the metadata directory tree
+(+metadirino+).
+
+----
+xfs_db> path -m /
+xfs_db> ls
+8 129 directory 0x0000002e 1 . (good)
+10 129 directory 0x0000172e 2 .. (good)
+12 33685632 directory 0x2d18ab4c 8 rtgroups (good)
+----
+
+Here we use the +path+ and +ls+ commands to display the root directory of
+the metadata directory. We can navigate the directory the old way, too:
+
+----
+xfs_db> p
+core.magic = 0x494e
+core.mode = 040000
+core.version = 3
+core.format = 1 (local)
+core.onlink = 0
+core.uid = 0
+core.gid = 0
+...
+v3.flags2 = 0x8000000000000018
+v3.cowextsize = 0
+v3.crtime.sec = Wed Aug 7 10:22:36 2024
+v3.crtime.nsec = 273744000
+v3.inumber = 129
+v3.uuid = 7e55b909-8728-4d69-a1fa-891427314eea
+v3.reflink = 0
+v3.cowextsz = 0
+v3.dax = 0
+v3.bigtime = 1
+v3.nrext64 = 1
+v3.metadata = 1
+u3.sfdir3.hdr.count = 1
+u3.sfdir3.hdr.i8count = 0
+u3.sfdir3.hdr.parent.i4 = 129
+u3.sfdir3.list[0].namelen = 8
+u3.sfdir3.list[0].offset = 0x60
+u3.sfdir3.list[0].name = "rtgroups"
+u3.sfdir3.list[0].inumber.i4 = 33685632
+u3.sfdir3.list[0].filetype = 2
+----
+
+The root of the metadata directory is a short format directory, and looks just
+like any other directory. The only difference is that the metadata flag is
+set, and the directory can only be viewed in the XFS debugger.
+
+----
+xfs_db> path -m /rtgroups/0.rmap
+btdump
+u3.rtrmapbt.recs[1] = [startblock,blockcount,owner,offset,extentflag,attrfork,bmbtblock]
+1:[0,1,-3,0,0,0,0]
+----
+
+Observe that we can use the xfs_db +path+ command to navigate the metadata
+directory tree to the realtime reverse mapping btree file of realtime group 0
+and display its contents.
+
[[Quota_Inodes]]
== Quota Inodes
-If quotas are used, two inodes are allocated for user and group quota
-management. If project quotas are used, these replace the group quota management
-and therefore uses the group quota inode.
+Prior to version 5 filesystems, two inodes can be allocated for quota
+management. The first inode will be used for user quotas. The second inode
+will be used for group quotas or project quotas, depending on mount options.
+Group and project quotas are mutually exclusive features in these environments.
+
+In version 5 or later filesystems, each quota type is allocated its own inode,
+making it possible to use group and project quota management simultaneously.
* Project quota's primary purpose is to track and monitor disk usage for
directories. For this to occur, the directory inode must have the
@@ -28,7 +150,7 @@ multiplied by the size of +xfs_dqblk_t+ (136 bytes).
.Quota inode layout
image::images/76.png[]
-Quota information is stored in the data extents of the two reserved quota
+Quota information is stored in the data extents of the reserved quota
inodes as an array of the +xfs_dqblk+ structures, where there is one array
element for each ID in the system:
@@ -124,6 +246,11 @@ limit will turn into a hard limit after the elapsed time exceeds ID zero's
+d_itimer+ value. When d_icount goes back below +d_ino_softlimit+, +d_itimer+
is reset back to zero.
+If the +XFS_SB_FEAT_INCOMPAT_BIGTIME+ feature is enabled, the 32 bits used by
+the timestamp field are interpreted as the upper 32 bits of a 34-bit unsigned
+seconds counter. See the section about xref:Quota_Timers[quota expiration
+timers] for more details.
+
*d_btimer*::
Specifies the time when the ID's +d_bcount+ exceeded +d_blk_softlimit+. The soft
limit will turn into a hard limit after the elapsed time exceeds ID zero's
@@ -165,41 +292,9 @@ Log sequence number of the last DQ block write.
*dd_crc*::
Checksum of the DQ block.
-
[[Real-time_Inodes]]
== Real-time Inodes
There are two inodes allocated to managing the real-time device's space, the
-Bitmap Inode and the Summary Inode.
-
-[[Real-Time_Bitmap_Inode]]
-=== Real-Time Bitmap Inode
-
-The real time bitmap inode, +sb_rbmino+, tracks the used/free space in the
-real-time device using an old-style bitmap. One bit is allocated per real-time
-extent. The size of an extent is specified by the superblock's +sb_rextsize+
-value.
-
-The number of blocks used by the bitmap inode is equal to the number of
-real-time extents (+sb_rextents+) divided by the block size (+sb_blocksize+)
-and bits per byte. This value is stored in +sb_rbmblocks+. The nblocks and
-extent array for the inode should match this. Each real time block gets its
-own bit in the bitmap.
-
-[[Real-Time_Summary_Inode]]
-=== Real-Time Summary Inode
-
-The real time summary inode, +sb_rsumino+, tracks the used and free space
-accounting information for the real-time device. This file indexes the
-approximate location of each free extent on the real-time device first by
-log2(extent size) and then by the real-time bitmap block number. The size of
-the summary inode file is equal to +sb_rbmblocks+ × log2(realtime device size)
-× sizeof(+xfs_suminfo_t+). The entry for a given log2(extent size) and
-rtbitmap block number is 0 if there is no free extents of that size at that
-rtbitmap location, and positive if there are any.
-
-This data structure is not particularly space efficient, however it is a very
-fast way to provide the same data as the two free space B+trees for regular
-files since the space is preallocated and metadata maintenance is minimal.
-
-include::rtrmapbt.asciidoc[]
+xref:Real-Time_Bitmap_Inode[Bitmap Inode] and the
+xref:Real-Time_Summary_Inode[Summary Inode].
diff --git a/design/XFS_Filesystem_Structure/journaling_log.asciidoc b/design/XFS_Filesystem_Structure/journaling_log.asciidoc
index 6109458..6b9d65c 100644
--- a/design/XFS_Filesystem_Structure/journaling_log.asciidoc
+++ b/design/XFS_Filesystem_Structure/journaling_log.asciidoc
@@ -215,6 +215,10 @@ magic number to distinguish themselves. Buffer data items only appear after
| +XFS_LI_CUD+ | 0x1243 | xref:CUD_Log_Item[Reference Count Update Done]
| +XFS_LI_BUI+ | 0x1244 | xref:BUI_Log_Item[File Block Mapping Update Intent]
| +XFS_LI_BUD+ | 0x1245 | xref:BUD_Log_Item[File Block Mapping Update Done]
+| +XFS_LI_ATTRI+ | 0x1246 | xref:ATTRI_Log_Item[Extended Attribute Update Intent]
+| +XFS_LI_ATTRD+ | 0x1247 | xref:ATTRD_Log_Item[Extended Attribute Update Done]
+| +XFS_LI_XMI+ | 0x1248 | xref:XMI_Log_Item[File Mapping Exchange Intent]
+| +XFS_LI_XMD+ | 0x1249 | xref:XMD_Log_Item[File Mapping Exchange Done]
|=====
Note that all log items (except for transaction headers) MUST start with
@@ -647,6 +651,8 @@ file block mapping operation we want. The upper three bytes are flag bits.
| Value | Description
| +XFS_BMAP_EXTENT_ATTR_FORK+ | Extent is for the attribute fork.
| +XFS_BMAP_EXTENT_UNWRITTEN+ | Extent is unwritten.
+| +XFS_BMAP_EXTENT_REALTIME+ | Mapping applies to the data fork of a
+realtime file. This flag cannot be combined with +XFS_BMAP_EXTENT_ATTR_FORK+.
|=====
The ``file block mapping update intent'' operation comes first; it tells the
@@ -712,6 +718,274 @@ Size of this log item. Should be 1.
*bud_bui_id*::
A 64-bit number that binds the corresponding BUI log item to this BUD log item.
+[[ATTRI_Log_Item]]
+=== Extended Attribute Update Intent
+
+The next two operation types work together to handle atomic extended attribute
+updates.
+
+The lower byte of the +alfi_op_flags+ field is a type code indicating what sort
+of extended attribute update operation we want.
+
+.Extended attribute update log intent types
+[options="header"]
+|=====
+| Value | Description
+| +XFS_ATTRI_OP_FLAGS_SET+ | Associate an attribute name with the
+given value, creating an entry for the name if necessary.
+| +XFS_ATTRI_OP_FLAGS_REMOVE+ | Remove an attribute name and any
+value associated with it.
+| +XFS_ATTRI_OP_FLAGS_REPLACE+ | Remove any value associated with an
+attribute name, then associate the name with the given value.
+| +XFS_ATTRI_OP_FLAGS_PPTR_SET+ | Add a parent pointer associating a directory entry name with a file handle to the parent directory. The (name, handle) tuple must not exist in the attribute structure.
+| +XFS_ATTRI_OP_FLAGS_PPTR_REMOVE+ | Remove a parent pointer from the attribute structure. The (name, handle) tuple must already exist.
+| +XFS_ATTRI_OP_FLAGS_PPTR_REPLACE+ | Remove a specific (name, handle) tuple from
+the attribute structure, then add a new (name, handle) tuple to the attribute structure.
+The two names and handles need not be the same.
+|=====
+
+The ``extended attribute update intent'' operation comes first; it tells the
+log that XFS wants to update one of a file's extended attributes. This record
+is crucial for correct log recovery because it enables us to spread a complex
+metadata update across multiple transactions while ensuring that a crash midway
+through the complex update will be replayed fully during log recovery.
+
+[source, c]
+----
+struct xfs_attri_log_format {
+ uint16_t alfi_type;
+ uint16_t alfi_size;
+ uint32_t alfi_igen;
+ uint64_t alfi_id;
+ uint64_t alfi_ino;
+ uint32_t alfi_op_flags;
+ union {
+ uint32_t alfi_name_len;
+ struct {
+ uint16_t alfi_old_name_len;
+ uint16_t alfi_new_name_len;
+ };
+ };
+ uint32_t alfi_value_len;
+ uint32_t alfi_attr_filter;
+};
+----
+
+*alfi_type*::
+The signature of an ATTRI operation, 0x1246. This value is in host-endian
+order, not big-endian like the rest of XFS.
+
+*alfi_size*::
+Size of this log item. Should be 1.
+
+*alfi_igen*::
+Generation number of the file being updated.
+
+*alfi_id*::
+A 64-bit number that binds the corresponding ATTRD log item to this ATTRI log
+item.
+
+*alfi_ino*::
+Inode number of the file being updated.
+
+*alfi_op_flags*::
+The operation being performed. The lower byte must be one of the
++XFS_ATTRI_OP_FLAGS_*+ flags defined above. The upper bytes must be zero.
+
+*alfi_name_len*::
+Length of the name of the extended attribute. This must not be zero.
+The attribute name itself is captured in the next log item.
+This field is not defined for the PPTR_REPLACE opcode.
+
+*alfi_old_name_len*::
+For PPTR_REPLACE, this is the length of the old name.
+
+*alfi_new_name_len*::
+For PPTR_REPLACE, this is the length of the new name.
+
+*alfi_value_len*::
+Length of the value of the extended attribute. This must be zero for remove
+operations, and nonzero for set and replace operations. The attribute value
+itself is captured in the log item immediately after the item containing the
+name.
+
+*alfi_attr_filter*::
+Attribute namespace filter flags. This must be one of +ATTR_ROOT+,
++ATTR_SECURE+, or +ATTR_INCOMPLETE+.
+
+For a SET or REPLACE opcode, there should be two regions after the ATTRI intent
+item. The first region contains the attribute name and the second contains the
+attribute value.
+
+For a REMOVE opcode, there should only be one region after the ATTRI intent
+item, and it will contain the attribute name.
+
+For a PPTR_SET or PPTR_REMOVE opcode, there should be two regions after the
+ATTRI intent item. The first region contains the dirent name as the attribute
+name. The second region contains a file handle to the parent directory as the
+attribute value.
+
+For a PPTR_REPLACE opcode, there should be four regions after the
+ATTRI intent item. The first region contains the dirent name to remove.
+The second region contains the dirent name to create. The third region
+contains the parent directory file handle to remove. The fourth region
+contains the parent directory file handle to add.
+
+[[ATTRD_Log_Item]]
+=== Completion of Extended Attribute Updates
+
+The ``extended attribute update done'' operation complements the ``extended
+attribute update intent'' operation. This second operation indicates that the
+update actually happened, so that log recovery needn't replay the update. The
+ATTRD and the actual updates are typically found in a new transaction following
+the transaction in which the ATTRI was logged.
+
+[source, c]
+----
+struct xfs_attrd_log_format {
+ __uint16_t alfd_type;
+ __uint16_t alfd_size;
+ __uint32_t __pad;
+ __uint64_t alfd_alf_id;
+};
+----
+
+*alfd_type*::
+The signature of an ATTRD operation, 0x1247. This value is in host-endian
+order, not big-endian like the rest of XFS.
+
+*alfd_size*::
+Size of this log item. Should be 1.
+
+*alfd_alf_id*::
+A 64-bit number that binds the corresponding ATTRI log item to this ATTRD log
+item.
+
+=== Extended Attribute Name and Value
+
+These regions contain the name and value components of the extended attribute
+being updated, as needed. There are no magic numbers; each region contains the
+data and nothing else.
+
+[[XMI_Log_Item]]
+=== File Mapping Exchange Intent
+
+These two log items work together to track the exchange of mapped extents
+between the forks of two files. Each operation requires a separate XMI/XMD
+pair. The log intent item has the following format:
+
+[source, c]
+----
+struct xfs_xmi_log_format {
+ uint16_t xmi_type;
+ uint16_t xmi_size;
+ uint32_t __pad;
+ uint64_t xmi_id;
+ uint64_t xmi_inode1;
+ uint64_t xmi_inode2;
+ uint32_t xmi_igen1;
+ uint32_t xmi_igen2;
+ uint64_t xmi_startoff1;
+ uint64_t xmi_startoff2;
+ uint64_t xmi_blockcount;
+ uint64_t xmi_flags;
+ int64_t xmi_isize1;
+ int64_t xmi_isize2;
+};
+----
+
+*xmi_type*::
+The signature of an XMI operation, 0x1248. This value is in host-endian order,
+not big-endian like the rest of XFS.
+
+*xmi_size*::
+Size of this log item. Should be 1.
+
+*__pad*::
+Must be zero.
+
+*xmi_id*::
+A 64-bit number that binds the corresponding XMD log item to this XMI log item.
+
+*xmi_inode1*::
+Inode number of the first file involved in the operation.
+
+*xmi_inode2*::
+Inode number of the second file involved in the operation.
+
+*xmi_igen1*::
+Generation number of the first file involved in the operation.
+
+*xmi_igen2*::
+Generation number of the second file involved in the operation.
+
+*xmi_startoff1*::
+Starting point within the first file, in units of filesystem blocks.
+
+*xmi_startoff2*::
+Starting point within the second file, in units of filesystem blocks.
+
+*xmi_blockcount*::
+The length to be exchanged, in units of filesystem blocks.
+
+*xmi_flags*::
+Behavioral changes to the operation, as follows:
+
+.File Extent Swap Intent Item Flags
+[options="header"]
+|=====
+| Value | Description
+| +XFS_EXCHMAPS_ATTR_FORK+ | Exchange extents between attribute forks.
+| +XFS_EXCHMAPS_SET_SIZES+ | Exchange the file sizes of the two files
+after the operation completes.
+| +XFS_EXCHMAPS_INO1_WRITTEN+ | Exchange the mappings of two files only
+if the file allocation units mapped to file1's range have been written.
+| +XFS_EXCHMAPS_CLEAR_INO1_REFLINK+ | Clear the reflink flag from inode1 after
+the operation.
+| +XFS_EXCHMAPS_CLEAR_INO2_REFLINK+ | Clear the reflink flag from inode2 after
+the operation.
+|=====
+
+*xmi_isize1*::
+The original size of the first file, in bytes. This is zero if the
++XFS_EXCHMAPS_SET_SIZES+ flag is not set.
+
+*xmi_isize2*::
+The original size of the second file, in bytes. This is zero if the
++XFS_EXCHMAPS_SET_SIZES+ flag is not set.
+
+[[XMD_Log_Item]]
+=== Completion of File Mapping Exchange
+
+The ``file mapping exchange done'' operation complements the ``file mapping
+exchange intent'' operation. This second operation indicates that the update
+actually happened, so that log recovery needn't replay the update. The XMD
+item and the actual updates are typically found in a new transaction following
+the transaction in which the XMI was logged. The completion has this format:
+
+[source, c]
+----
+struct xfs_xmd_log_format {
+ uint16_t xmd_type;
+ uint16_t xmd_size;
+ uint32_t __pad;
+ uint64_t xmd_xmi_id;
+};
+----
+
+*xmd_type*::
+The signature of an XMD operation, 0x1249. This value is in host-endian order,
+not big-endian like the rest of XFS.
+
+*xmd_size*::
+Size of this log item. Should be 1.
+
+*__pad*::
+Must be zero.
+
+*xmd_xmi_id*::
+A 64-bit number that binds the corresponding XMI log item to this XMD log item.
+
[[Inode_Log_Item]]
=== Inode Updates
@@ -810,7 +1084,7 @@ missing the +ilf_pad+ field and is 52 bytes long as opposed to 56 bytes.
This region contains the new contents of a part of an inode, as described in
the xref:Inode_Log_Item[previous section]. There are no magic numbers.
-If +XFS_ILOG_CORE+ is set in +ilf_fields+, the correpsonding data buffer must
+If +XFS_ILOG_CORE+ is set in +ilf_fields+, the corresponding data buffer must
be in the format +struct xfs_icdinode+, which has the same format as the first
96 bytes of an xref:On-disk_Inode[inode], but is recorded in host byte order.
@@ -867,7 +1141,7 @@ The size of +blf_data_map+, in 32-bit words.
This variable-sized array acts as a dirty bitmap for the logged buffer. Each
1 bit represents a dirty region in the buffer, and each run of 1 bits
corresponds to a subsequent log item containing the new contents of the buffer
-area. Each bit represents +(blf_len * 512) / (blf_map_size * NBBY)+ bytes.
+area. Each bit represents +XFS_BLF_CHUNK+ (i.e. 128) bytes.
[[Buffer_Data_Log_Item]]
=== Buffer Data Log Item
diff --git a/design/XFS_Filesystem_Structure/magic.asciidoc b/design/XFS_Filesystem_Structure/magic.asciidoc
index 7e62783..5da29b9 100644
--- a/design/XFS_Filesystem_Structure/magic.asciidoc
+++ b/design/XFS_Filesystem_Structure/magic.asciidoc
@@ -45,9 +45,12 @@ relevant chapters. Magic numbers tend to have consistent locations:
| +XFS_ATTR3_LEAF_MAGIC+ | 0x3bee | | xref:Leaf_Attributes[Leaf Attribute], v5 only
| +XFS_ATTR3_RMT_MAGIC+ | 0x5841524d | XARM | xref:Remote_Values[Remote Attribute Value], v5 only
| +XFS_RMAP_CRC_MAGIC+ | 0x524d4233 | RMB3 | xref:Reverse_Mapping_Btree[Reverse Mapping B+tree], v5 only
+| +XFS_RTBITMAP_MAGIC+ | 0x424D505A | BMPZ | xref:Real-Time_Bitmap_Inode[Real-Time Bitmap], metadir only
+| +XFS_RTSUMMARY_MAGIC+ | 0x53554D59 | SUMY | xref:Real-Time_Summary_Inode[Real-Time Summary], metadir only
| +XFS_RTRMAP_CRC_MAGIC+ | 0x4d415052 | MAPR | xref:Real_time_Reverse_Mapping_Btree[Real-Time Reverse Mapping B+tree], v5 only
| +XFS_REFC_CRC_MAGIC+ | 0x52334643 | R3FC | xref:Reference_Count_Btree[Reference Count B+tree], v5 only
| +XFS_MD_MAGIC+ | 0x5846534d | XFSM | xref:Metadata_Dumps[Metadata Dumps]
+| +XFS_RTSB_MAGIC+ | 0x46726F67 | Frog | xref:Realtime_Groups[Realtime Groups]
|=====
The magic numbers for log items are at offset zero in each log item, but items
@@ -71,6 +74,10 @@ are not aligned to blocks.
| +XFS_LI_CUD+ | 0x1243 | | xref:CUD_Log_Item[Reference Count Update Done]
| +XFS_LI_BUI+ | 0x1244 | | xref:BUI_Log_Item[File Block Mapping Update Intent]
| +XFS_LI_BUD+ | 0x1245 | | xref:BUD_Log_Item[File Block Mapping Update Done]
+| +XFS_LI_ATTRI+ | 0x1246 | | xref:ATTRI_Log_Item[Extended Attribute Update Intent]
+| +XFS_LI_ATTRD+ | 0x1247 | | xref:ATTRD_Log_Item[Extended Attribute Update Done]
+| +XFS_LI_XMI+ | 0x1248 | | xref:XMI_Log_Item[File Mapping Exchange Intent]
+| +XFS_LI_XMD+ | 0x1249 | | xref:XMD_Log_Item[File Mapping Exchange Done]
|=====
= Theoretical Limits
@@ -92,5 +99,5 @@ XFS can create really big filesystems!
| Max Dir Size | 32GiB | 32GiB | 32GiB
|=====
-Linux doesn't suppport files or devices larger than 8EiB, so the block
+Linux doesn't support files or devices larger than 8EiB, so the block
limitations are largely ignorable.
diff --git a/design/XFS_Filesystem_Structure/metadata_integrity.asciidoc b/design/XFS_Filesystem_Structure/metadata_integrity.asciidoc
deleted file mode 100644
index f948d5e..0000000
--- a/design/XFS_Filesystem_Structure/metadata_integrity.asciidoc
+++ /dev/null
@@ -1,36 +0,0 @@
-= Metadata Integrity
-
-Prior to version 5, most XFS metadata blocks contained a magic number that
-could provide a minimal sanity check that a block read off the disk contained
-the same type of data that the code thought it was reading off the disk.
-However, this was insufficient -- given a correct type code, it was still
-impossible to tell if the block was from a previous filesystem, or happened to
-be owned by something else, or had been written to the wrong location on disk.
-Furthermore, not all metadata blocks had magic numbers -- remote extended
-attributes and extent symbolic links had no protection at all.
-
-Therefore, the version 5 disk format introduced larger headers for all metadata
-types, which enable the filesystem to check information being read from the
-disk more rigorously. Metadata integrity fields now include:
-
-* *Magic* numbers, to classify all types of metadata. This is unchanged from v4.
-* A copy of the filesystem *UUID*, to confirm that a given disk block is connected to the superblock.
-* The *owner*, to avoid accessing a piece of metadata which belongs to some other part of the filesystem.
-* The filesystem *block number*, to detect misplaced writes.
-* The *log serial number* of the last write to this block, to avoid replaying obsolete log entries.
-* A CRC32c *checksum* of the entire block, to detect minor corruption.
-
-Metadata integrity coverage has been extended to all metadata blocks in the
-filesystem, with the following notes:
-
-* Inodes can have multiple ``owners'' in the directory tree; therefore the record contains the inode number instead of an owner or a block number.
-* Superblocks have no owners.
-* The disk quota file has no owner or block numbers.
-* Metadata owned by files list the inode number as the owner.
-* Per-AG data and B+tree blocks list the AG number as the owner.
-* Per-AG header sectors don't list owners or block numbers, since they have fixed locations.
-* Remote attribute blocks are not logged and therefore the LSN must be -1.
-
-This functionality enables XFS to decide that a block contents are so
-unexpected that it should stop immediately. Unfortunately checksums do not
-allow for automatic correction. Please keep regular backups, as always.
diff --git a/design/XFS_Filesystem_Structure/metadump.asciidoc b/design/XFS_Filesystem_Structure/metadump.asciidoc
index 2bddb77..226622c 100644
--- a/design/XFS_Filesystem_Structure/metadump.asciidoc
+++ b/design/XFS_Filesystem_Structure/metadump.asciidoc
@@ -6,6 +6,9 @@ snapshot of a live file system and to restore that snapshot onto a block
device for debugging purposes. Only the metadata are captured in the
snapshot, and the metadata blocks may be obscured for privacy reasons.
+[[Metadump_v1]]
+== Metadump v1
+
A metadump file starts with a +xfs_metablock+ that records the addresses of
the blocks that follow. Following that are the metadata blocks captured
from the filesystem. The first block following the first superblock
@@ -21,7 +24,7 @@ struct xfs_metablock {
__be32 mb_magic;
__be16 mb_count;
uint8_t mb_blocklog;
- uint8_t mb_reserved;
+ uint8_t mb_info;
__be64 mb_daddr[];
};
----
@@ -37,14 +40,127 @@ Number of blocks indexed by this record. This value must not exceed +(1
The base-2 logarithm of the metadump block size. A metadump block is 512
bytes, so this value should be 9.
-*mb_reserved*::
-Reserved. Should be zero.
+*mb_info*::
+A combination of the following flags:
+
+.Metadump information flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_METADUMP_INFO_FLAGS+ |
+This field is nonzero.
+
+| +XFS_METADUMP_OBFUSCATED+ |
+User-supplied directory entry and extended attribute names have been obscured,
+and extended attribute values are zeroed to protect privacy.
+
+| +XFS_METADUMP_FULLBLOCKS+ |
+Entire metadata blocks have been dumped, including unused areas.
+If not set, the unused areas are zeroed.
+
+| +XFS_METADUMP_DIRTYLOG+ |
+The log was dirty when the dump was captured.
+
+|=====
*mb_daddr*::
An array of disk addresses. Each of the +mb_count+ blocks (of size +(1
<< mb_blocklog)+) following the +xfs_metablock+ should be written back to
the address pointed to by the corresponding +mb_daddr+ entry.
+[[Metadump_v2]]
+== Metadump v2
+
+A v2 metadump file starts with a +xfs_metadump_header+ structure that records
+information about the dump itself. Immediately after this header is a sequence
+of records, each a +xfs_meta_extent+ structure describing an extent of data
+followed by the data itself. Data areas must be a multiple of 512 bytes long.
+
+.Metadata v2 Dump Format
+
+[source, c]
+----
+struct xfs_metadump_header {
+ __be32 xmh_magic;
+ __be32 xmh_version;
+ __be32 xmh_compat_flags;
+ __be32 xmh_incompat_flags;
+ __be64 xmh_reserved;
+} __packed;
+----
+
+*xmh_magic*::
+The magic number, ``XMD2'' (0x584D4432).
+
+*xmh_version*::
+The value 2.
+
+*xmh_compat_flags*::
+A combination of the following flags:
+
+.Metadump v2 compat flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_MD2_COMPAT_OBFUSCATED+ |
+User-supplied directory entry and extended attribute names have been obscured,
+and extended attribute values are zeroed to protect privacy.
+
+| +XFS_MD2_COMPAT_FULLBLOCKS+ |
+Entire metadata blocks have been dumped, including unused areas.
+If not set, the unused areas are zeroed.
+
+| +XFS_MD2_COMPAT_DIRTYLOG+ |
+The log was dirty when the dump was captured.
+
+| +XFS_MD2_COMPAT_EXTERNALLOG+ |
+Dump contains external log contents.
+
+|=====
+
+*xmh_incompat_flags*::
+A combination of the following flags:
+
+.Metadump v2 incompat flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_MD2_INCOMPAT_RTDEVICE+ |
+Dump contains realtime device contents.
+
+|=====
+
+*xmh_reserved*::
+Must be zero.
+
+.Metadata v2 Extent Format
+
+[source, c]
+----
+struct xfs_meta_extent {
+ __be64 xme_addr;
+ __be32 xme_len;
+} __packed;
+----
+
+*xme_addr*::
+Bits 54-55 (counting from bit 0) determine the device from which the metadata dump data was extracted.
+
+.Metadump v2 extent flags
+[options="header"]
+|=====
+| Value | Description
+| 0 | Data device
+| 1 | External log
+| 2 | Realtime device
+|=====
+
+The lower 54 bits determine the device address from which the dump data was
+extracted, in units of 512 bytes.
+
+*xme_len*::
+Length of the metadata dump data region, in units of 512 bytes.
+
== Dump Obfuscation
Unless explicitly disabled, the +xfs_metadump+ tool obfuscates empty block
diff --git a/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc b/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc
index 02d44ac..6e52e5f 100644
--- a/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc
+++ b/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc
@@ -78,20 +78,50 @@ struct xfs_dinode_core {
__uint16_t di_mode;
__int8_t di_version;
__int8_t di_format;
- __uint16_t di_onlink;
+ union {
+ __uint16_t di_onlink;
+ __uint16_t di_metatype;
+ };
__uint32_t di_uid;
__uint32_t di_gid;
__uint32_t di_nlink;
__uint16_t di_projid;
__uint16_t di_projid_hi;
- __uint8_t di_pad[6];
- __uint16_t di_flushiter;
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
xfs_timestamp_t di_atime;
xfs_timestamp_t di_mtime;
xfs_timestamp_t di_ctime;
xfs_fsize_t di_size;
xfs_rfsblock_t di_nblocks;
xfs_extlen_t di_extsize;
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
xfs_extnum_t di_nextents;
xfs_aextnum_t di_anextents;
__uint8_t di_forkoff;
@@ -161,8 +191,24 @@ In v1 inodes, this specifies the number of links to the inode from directories.
When the number exceeds 65535, the inode is converted to v2 and the link count
is stored in +di_nlink+.
+*di_metatype*::
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, the +di_onlink+ field
+is redefined to declare the intended contents of files in the metadata
+directory tree.
+
+[source, c]
+----
+enum xfs_metafile_type {
+ XFS_METAFILE_USRQUOTA,
+ XFS_METAFILE_GRPQUOTA,
+ XFS_METAFILE_PRJQUOTA,
+ XFS_METAFILE_RTBITMAP,
+ XFS_METAFILE_RTSUMMARY,
+};
+----
+
*di_uid*::
-Specifies the owner's UID of the inode.
+Specifies the owner's UID of the inode.
*di_gid*::
Specifies the owner's GID of the inode.
@@ -181,10 +227,17 @@ Specifies the high 16 bits of the owner's project ID in v2 inodes, if the
+XFS_SB_VERSION2_PROJID32BIT+ feature is set; and zero otherwise.
*di_v2_pad[6]*::
-Reserved, must be zero.
+Reserved, must be zero. Only exists for v2 inodes.
*di_flushiter*::
-Incremented on flush.
+Incremented on flush. Only exists for v2 inodes.
+
+*di_v3_pad*::
+Must be zero for v3 inodes without the NREXT64 flag set.
+
+*di_big_nextents*::
+Specifies the number of data extents associated with this inode if the NREXT64
+flag is set. This allows for up to 2^48^ - 1 extent mappings.
*di_atime*::
@@ -200,6 +253,10 @@ struct xfs_timestamp {
};
----
+If the +XFS_SB_FEAT_INCOMPAT_BIGTIME+ feature is enabled, the 64 bits used by
+the timestamp field are interpreted as a flat 64-bit nanosecond counter.
+See the section about xref:Inode_Timestamps[inode timestamps] for more details.
+
*di_mtime*::
Specifies the last time the file was modified.
@@ -227,10 +284,19 @@ file is written to beyond allocated space, XFS will attempt to allocate
additional disk space based on this value.
*di_nextents*::
-Specifies the number of data extents associated with this inode.
+Specifies the number of data extents associated with this inode if the NREXT64
+flag is not set. Supports up to 2^31^ - 1 extents.
*di_anextents*::
-Specifies the number of extended attribute extents associated with this inode.
+Specifies the number of extended attribute extents associated with this inode
+if the NREXT64 flag is not set. Supports up to 2^15^ - 1 extents.
+
+*di_big_anextents*::
+Specifies the number of extended attribute extents associated with this inode
+if the NREXT64 flag is set. Supports up to 2^32^ - 1 extents.
+
+*di_nrext64_pad*::
+Must be zero if the NREXT64 flag is set.
*di_forkoff*::
Specifies the offset into the inode's literal area where the extended attribute
@@ -332,6 +398,16 @@ This inode shares (or has shared) data blocks with another inode.
For files, this is the extent size hint for copy on write operations; see
+di_cowextsize+ for details. For directories, the value in +di_cowextsize+
will be copied to all newly created files and directories.
+| +XFS_DIFLAG2_NREXT64+ |
+Files with this flag set may have up to (2^48^ - 1) extents mapped to the data
+fork and up to (2^32^ - 1) extents mapped to the attribute fork. This flag
+requires the +XFS_SB_FEAT_INCOMPAT_NREXT64+ feature to be enabled.
+| +XFS_DIFLAG2_METADATA+ |
+This file contains filesystem metadata. This flag requires the
++XFS_SB_FEAT_INCOMPAT_METADIR+ feature to be enabled. See the section about
+xref:Metadata_Directories[metadata directories] for more information on
+metadata inode properties. Only directories and regular files can have this
+flag set.
|=====
*di_cowextsize*::
diff --git a/design/XFS_Filesystem_Structure/overview.asciidoc b/design/XFS_Filesystem_Structure/overview.asciidoc
index d15b50a..7628a7d 100644..100755
--- a/design/XFS_Filesystem_Structure/overview.asciidoc
+++ b/design/XFS_Filesystem_Structure/overview.asciidoc
@@ -28,7 +28,7 @@ record. Both forks associate a logical offset with an extent of physical
blocks, which makes sparse files and directories possible. Directory entries
and extended attributes are contained inside a second-level data structure
within the blocks that are mapped by the forks. This structure consists of
-variable-length directory or attribute records and possible a second B+tree to
+variable-length directory or attribute records and, possibly, a second B+tree to
index these records.
XFS employs a journalling log in which metadata changes are collected so that
diff --git a/design/XFS_Filesystem_Structure/realtime.asciidoc b/design/XFS_Filesystem_Structure/realtime.asciidoc
new file mode 100644
index 0000000..3a72eb5
--- /dev/null
+++ b/design/XFS_Filesystem_Structure/realtime.asciidoc
@@ -0,0 +1,394 @@
+[[Real-time_Devices]]
+= Real-time Devices
+
+The performance of the standard XFS allocator varies depending on the internal
+state of the various metadata indices enabled on the filesystem. For
+applications which need to minimize the jitter of allocation latency, XFS
+supports the notion of a ``real-time device''. This is a special device
+separate from the regular filesystem where extent allocations are tracked with
+a bitmap and free space is indexed with a two-dimensional array. If an inode
+is flagged with +XFS_DIFLAG_REALTIME+, its data will live on the real time
+device.
+
+By placing the real time device (and the journal) on separate high-performance
+storage devices, it is possible to reduce most of the unpredictability in I/O
+response times that come from metadata operations.
+
+None of the XFS per-AG B+trees are involved with real time files. It is not
+possible for real time files to share data blocks.
+
+[[Real-Time_Bitmap_Inode]]
+== Free Space Bitmap Inode
+
+The real time bitmap inode, +sb_rbmino+, tracks the used/free space in the
+real-time device using an old-style bitmap. One bit is allocated per real-time
+extent. The size of an extent is specified by the superblock's +sb_rextsize+
+value.
+
+The number of blocks used by the bitmap inode is equal to the number of
+real-time extents (+sb_rextents+) divided by the block size (+sb_blocksize+)
+and bits per byte. This value is stored in +sb_rbmblocks+. The nblocks and
+extent array for the inode should match this. Each real-time extent gets its
+own bit in the bitmap.
+
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, each block of the
+realtime bitmap file has a header of the following format:
+
+[source, c]
+----
+struct xfs_rtbuf_blkinfo {
+ __be32 rt_magic;
+ __be32 rt_crc;
+ __be64 rt_owner;
+ __be64 rt_blkno;
+ __be64 rt_lsn;
+ uuid_t rt_uuid;
+};
+----
+
+*rt_magic*::
+Specifies the magic number for the rtbitmap block: ``BMPZ'' (0x424D505A).
+
+*rt_crc*::
+Checksum of the block.
+
+*rt_owner*::
+Specifies the inode number for the file that owns this block.
+
+*rt_blkno*::
+Disk address of this block.
+
+*rt_lsn*::
+Log sequence number of the last write to this block.
+
+*rt_uuid*::
+The UUID of this block, which must match either +sb_uuid+ or +sb_meta_uuid+
+depending on which features are set.
+
+After the block header, the bitmap data are encoded as be32 word values.
+
+=== xfs_db rtbitmap Example
+
+This example shows a real-time bitmap file from a freshly populated filesystem:
+
+----
+xfs_db> path -m /rtgroups/3.bitmap
+xfs_db> p
+core.magic = 0x494e
+core.mode = 0100000
+core.version = 3
+core.format = 2 (extents)
+core.metatype = 5 (rtbitmap)
+core.uid = 0
+core.gid = 0
+core.nlinkv2 = 1
+core.projid_lo = 3
+core.projid_hi = 0
+core.nextents = 1
+core.atime.sec = Tue Oct 15 16:04:02 2024
+core.atime.nsec = 769675000
+core.mtime.sec = Tue Oct 15 16:04:02 2024
+core.mtime.nsec = 769675000
+core.ctime.sec = Tue Oct 15 16:04:02 2024
+core.ctime.nsec = 769681000
+core.size = 135168
+core.nblocks = 33
+core.extsize = 0
+core.naextents = 0
+core.forkoff = 24
+core.aformat = 1 (local)
+core.dmevmask = 0
+core.dmstate = 0
+core.newrtbm = 0
+core.prealloc = 0
+core.realtime = 0
+core.immutable = 1
+core.append = 0
+core.sync = 1
+core.noatime = 1
+core.nodump = 1
+core.rtinherit = 0
+core.projinherit = 0
+core.nosymlinks = 0
+core.extsz = 0
+core.extszinherit = 0
+core.nodefrag = 1
+core.filestream = 0
+core.gen = 2653591217
+next_unlinked = null
+v3.crc = 0x34a17119 (correct)
+v3.change_count = 3
+v3.lsn = 0
+v3.flags2 = 0x38
+v3.cowextsize = 0
+v3.crtime.sec = Tue Oct 15 16:04:02 2024
+v3.crtime.nsec = 769675000
+v3.inumber = 33685633
+v3.uuid = a6575f59-1514-445e-883e-211b2c5a0f05
+v3.reflink = 0
+v3.cowextsz = 0
+v3.dax = 0
+v3.bigtime = 1
+v3.nrext64 = 1
+v3.metadata = 1
+u3.bmx[0] = [startoff,startblock,blockcount,extentflag]
+0:[0,4210712,33,0]
+a.sfattr.hdr.totsize = 27
+a.sfattr.hdr.count = 1
+a.sfattr.list[0].namelen = 8
+a.sfattr.list[0].valuelen = 12
+a.sfattr.list[0].root = 0
+a.sfattr.list[0].secure = 0
+a.sfattr.list[0].parent = 1
+a.sfattr.list[0].name = "0.bitmap"
+a.sfattr.list[0].parent_dir.inumber = 33685632
+a.sfattr.list[0].parent_dir.gen = 142228546
+xfs_db> dblock 0
+xfs_db> p
+magicnum = 0x424d505a
+crc = 0xc8b10abf (correct)
+owner = 33685633
+bno = 20902080
+lsn = 0x100007696
+uuid = a6575f59-1514-445e-883e-211b2c5a0f05
+rtwords[0-1011] = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0
+14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0xfffff800 22:0xffffffff 23:0xffffffff
+24:0xffffffff 25:0xffffffff 26:0xffffffff 27:0xffffffff 28:0xffffffff
+29:0xffffffff 30:0xffffffff 31:0xffffffff 32:0xffffffff
+...
+979:0xffffffff 980:0xffffffff 981:0xffffffff 982:0xffffffff 983:0xffffffff
+984:0xffffffff 985:0xffffffff 986:0xffffffff 987:0xffffffff 988:0xffffffff
+989:0xffffffff 990:0xffffffff 991:0xffffffff 992:0xffffffff 993:0xffffffff
+994:0xffffffff 995:0xffffffff 996:0xffffffff 997:0xffffffff 998:0xffffffff
+999:0xffffffff 1000:0xffffffff 1001:0xffffffff 1002:0xffffffff 1003:0xffffffff
+1004:0xffffffff 1005:0xffffffff 1006:0xffffffff 1007:0xffffffff 1008:0xffffffff
+1009:0xffffffff 1010:0xffffffff 1011:0xffffffff
+----
+
+From this example, we can clearly see that this is a bitmap file in the
+metadata directory tree, and that it is the bitmap file for rtgroup 3. When we
+access the first block in the bitmap file, we can clearly see the new block
+header and that the first 179 extents are allocated. The bitmap words were
+excerpted for brevity.
+
+[[Real-Time_Summary_Inode]]
+== Free Space Summary Inode
+
+The real time summary inode, +sb_rsumino+, tracks the used and free space
+accounting information for the real-time device. This file indexes the
+approximate location of each free extent on the real-time device first by
+log2(extent size) and then by the real-time bitmap block number. The size of
+the summary inode file is equal to +sb_rbmblocks+ × log2(realtime device size)
+× sizeof(+xfs_suminfo_t+). The entry for a given log2(extent size) and
+rtbitmap block number is 0 if there are no free extents of that size at that
+rtbitmap location, and positive if there are any.
+
+This data structure is not particularly space efficient; however, it is a very
+fast way to provide the same data as the two free space B+trees for regular
+files since the space is preallocated and metadata maintenance is minimal.
+
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, each block of the
+realtime summary file has the same header as rtbitmap file blocks. However,
+the magic number will be ``SUMY'' (0x53554D59). After the block header, the
+summary counts are encoded as be32 integers.
+
+=== xfs_db rtsummary Example
+
+This example shows a real-time summary file from a freshly populated filesystem:
+
+----
+xfs_db> path -m /rtgroups/3.summary
+xfs_db> p
+core.magic = 0x494e
+core.mode = 0100000
+core.version = 3
+core.format = 2 (extents)
+core.metatype = 6 (rtsummary)
+core.uid = 0
+core.gid = 0
+core.nlinkv2 = 1
+core.projid_lo = 3
+core.projid_hi = 0
+core.nextents = 1
+core.atime.sec = Tue Oct 15 16:04:02 2024
+core.atime.nsec = 769694000
+core.mtime.sec = Tue Oct 15 16:04:02 2024
+core.mtime.nsec = 769694000
+core.ctime.sec = Tue Oct 15 16:04:02 2024
+core.ctime.nsec = 769699000
+core.size = 4096
+core.nblocks = 1
+core.extsize = 0
+core.naextents = 0
+core.forkoff = 24
+core.aformat = 1 (local)
+core.dmevmask = 0
+core.dmstate = 0
+core.newrtbm = 0
+core.prealloc = 0
+core.realtime = 0
+core.immutable = 1
+core.append = 0
+core.sync = 1
+core.noatime = 1
+core.nodump = 1
+core.rtinherit = 0
+core.projinherit = 0
+core.nosymlinks = 0
+core.extsz = 0
+core.extszinherit = 0
+core.nodefrag = 1
+core.filestream = 0
+core.gen = 519466891
+next_unlinked = null
+v3.crc = 0x54fc58d0 (correct)
+v3.change_count = 3
+v3.lsn = 0
+v3.flags2 = 0x38
+v3.cowextsize = 0
+v3.crtime.sec = Tue Oct 15 16:04:02 2024
+v3.crtime.nsec = 769694000
+v3.inumber = 33685634
+v3.uuid = a6575f59-1514-445e-883e-211b2c5a0f05
+v3.reflink = 0
+v3.cowextsz = 0
+v3.dax = 0
+v3.bigtime = 1
+v3.nrext64 = 1
+v3.metadata = 1
+u3.bmx[0] = [startoff,startblock,blockcount,extentflag]
+0:[0,4210703,1,0]
+a.sfattr.hdr.totsize = 28
+a.sfattr.hdr.count = 1
+a.sfattr.list[0].namelen = 9
+a.sfattr.list[0].valuelen = 12
+a.sfattr.list[0].root = 0
+a.sfattr.list[0].secure = 0
+a.sfattr.list[0].parent = 1
+a.sfattr.list[0].name = "0.summary"
+a.sfattr.list[0].parent_dir.inumber = 33685632
+a.sfattr.list[0].parent_dir.gen = 142228546
+xfs_db> dblock 0
+xfs_db> p
+magicnum = 0x53554d59
+crc = 0x473340a8 (correct)
+owner = 33685634
+bno = 20902008
+lsn = 0x100007696
+uuid = a6575f59-1514-445e-883e-211b2c5a0f05
+suminfo[0-1011] = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0
+14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0 22:0 23:0 24:0 25:0 26:0 27:0 28:0 29:0
+30:0 31:0 32:0
+...
+618:0 619:0 620:0 621:0 622:0 623:0 624:0 625:0 626:0 627:1 628:0 629:0 630:0
+...
+979:0 980:0 981:0 982:0 983:0 984:0 985:0 986:0 987:0 988:0 989:0 990:0 991:0
+992:0 993:0 994:0 995:0 996:0 997:0 998:0 999:0 1000:0 1001:0 1002:0 1003:0
+1004:0 1005:0 1006:0 1007:0 1008:0 1009:0 1010:0 1011:0
+----
+
+From this example, we can clearly see that this is a summary file in the
+metadata directory tree, and that it is the summary file for rtgroup 3. When
+we access the first block in the summary file, we can clearly see the new block
+header and the nonzero counter for the one large free extent in this group.
+The summary counts were excerpted for brevity.
+
+[[Realtime_Groups]]
+== Realtime Groups
+
+To reduce metadata contention for space allocation and remapping activities
+being applied to realtime files, the realtime volume can be split into
+allocation groups, just like the data volume. The free space information is
+still contained in a single file that applies to the entire volume.
+
+Each realtime allocation group can contain up to (2^31^ - 1) filesystem blocks,
+regardless of the underlying realtime extent size.
+
+Each realtime group has the following characteristics:
+
+ * Group 0 has a super block describing overall filesystem info
+ * Free space bitmap
+ * Summary of free space
+
+The free space metadata are the same as described in the previous sections,
+except that their scope covers only a single rtgroup. The other structures are
+expanded upon in the following sections.
+
+[[Realtime_Group_Superblocks]]
+=== Superblocks
+
+The first block of realtime group 0 contains a superblock. Its fields
+must match their counterparts in the filesystem superblock on the data device.
+
+[source, c]
+----
+struct xfs_rtsb {
+ __be32 rsb_magicnum;
+ __le32 rsb_crc;
+
+ __be32 rsb_pad;
+ unsigned char rsb_fname[XFSLABEL_MAX];
+
+ uuid_t rsb_uuid;
+ uuid_t rsb_meta_uuid;
+
+ /* must be padded to 64 bit alignment */
+};
+----
+
+*rsb_magicnum*::
+Identifies the filesystem. Its value is +XFS_RTSB_MAGIC+ ``Frog'' (0x46726F67).
+
+*rsb_crc*::
+Superblock checksum.
+
+*rsb_pad*::
+Must be zero.
+
+*rsb_fname[12]*::
+Name for the filesystem. This matches +sb_fname+ in the primary superblock.
+
+*rsb_uuid*::
+UUID (Universally Unique ID) for the filesystem. This matches +sb_uuid+ in the
+primary superblock.
+
+*rsb_meta_uuid*::
+Metadata UUID for the filesystem. This matches +sb_meta_uuid+ in the primary
+superblock.
+
+==== xfs_db rtgroup Superblock Example
+
+A filesystem is made on a multidisk system with the following command:
+
+----
+# mkfs.xfs -r rtgroups=1,rgcount=4,rtdev=/dev/sdb /dev/sda -f
+meta-data=/dev/sda isize=512 agcount=4, agsize=1298176 blks
+ = sectsz=512 attr=2, projid32bit=1
+ = crc=1 finobt=1, sparse=1, rmapbt=1
+ = reflink=1 bigtime=1 inobtcount=1 nrext64=1
+ = metadir=1
+data = bsize=4096 blocks=5192704, imaxpct=25
+ = sunit=0 swidth=0 blks
+naming =version 2 bsize=4096 ascii-ci=0, ftype=1
+log =internal log bsize=4096 blocks=16384, version=2
+ = sectsz=512 sunit=0 blks, lazy-count=1
+realtime =/dev/sdb extsz=4096 blocks=5192704, rtextents=5192704
+ = rgcount=5 rgsize=1048576 extents
+----
+
+And in xfs_db, we can inspect the realtime group superblock on the realtime
+device:
+
+----
+# xfs_db -R /dev/sdb /dev/sda
+xfs_db> rtsb
+xfs_db> print
+magicnum = 0x46726f67
+crc = 0x759a62d4 (correct)
+pad = 0
+fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
+uuid = 7e55b909-8728-4d69-a1fa-891427314eea
+meta_uuid = 7e55b909-8728-4d69-a1fa-891427314eea
+----
+
+include::rtrmapbt.asciidoc[]
diff --git a/design/XFS_Filesystem_Structure/reconstruction.asciidoc b/design/XFS_Filesystem_Structure/reconstruction.asciidoc
index f172e0f..f4c1021 100644
--- a/design/XFS_Filesystem_Structure/reconstruction.asciidoc
+++ b/design/XFS_Filesystem_Structure/reconstruction.asciidoc
@@ -1,10 +1,6 @@
[[Reconstruction]]
= Metadata Reconstruction
-[NOTE]
-This is a theoretical discussion of how reconstruction could work; none of this
-is implemented as of 2015.
-
A simple UNIX filesystem can be thought of in terms of a directed acyclic graph.
To a first approximation, there exists a root directory node, which points to
other nodes. Those other nodes can themselves be directories or they can be
@@ -45,9 +41,14 @@ The xref:Reverse_Mapping_Btree[reverse-mapping B+tree] fills in part of the
puzzle. Since it contains copies of every entry in each inode’s data and
attribute forks, we can fix a corrupted block map with these records.
Furthermore, if the inode B+trees become corrupt, it is possible to visit all
-inode chunks using the reverse-mapping data. Should XFS ever gain the ability
-to store parent directory information in each inode, it also becomes possible
+inode chunks using the reverse-mapping data. xref:Parent_Pointers[Directory
+parent pointers] fill in the rest of the puzzle by mirroring the directory tree
+structure with parent directory information in each inode. It is now possible
to resurrect damaged directory trees, which should reduce the complaints about
inodes ending up in +/lost+found+. Everything else in the per-AG primary
-metadata can already be reconstructed via +xfs_repair+. Hopefully,
-reconstruction will not turn out to be a fool's errand.
+metadata can already be reconstructed via +xfs_repair+.
+
+See the
+https://docs.kernel.org/filesystems/xfs/xfs-online-fsck-design.html[design
+document] for online repair for a more thorough discussion of how these metadata
+are put to use.
diff --git a/design/XFS_Filesystem_Structure/refcountbt.asciidoc b/design/XFS_Filesystem_Structure/refcountbt.asciidoc
index 508a9dd..1614087 100644
--- a/design/XFS_Filesystem_Structure/refcountbt.asciidoc
+++ b/design/XFS_Filesystem_Structure/refcountbt.asciidoc
@@ -6,7 +6,7 @@ This data structure is under construction! Details may change.
To support the sharing of file data blocks (reflink), each allocation group has
its own reference count B+tree, which grows in the allocated space like the
-inode B+trees. This data could be gleaned by performing an interval query of
+inode B+trees. This data could be collected by performing an interval query of
the reverse-mapping B+tree, but doing so would come at a huge performance
penalty. Therefore, this data structure is a cache of computable information.
diff --git a/design/xfs-self-describing-metadata.asciidoc b/design/XFS_Filesystem_Structure/self_describing_metadata.asciidoc
index b7dc3ff..c79e865 100644
--- a/design/xfs-self-describing-metadata.asciidoc
+++ b/design/XFS_Filesystem_Structure/self_describing_metadata.asciidoc
@@ -1,6 +1,4 @@
-= XFS Self Describing Metadata
-Dave Chinner, <dchinner@redhat.com>
-v1.0, Feb 2014: Initial conversion to asciidoc
+= Metadata Integrity
== Introduction
@@ -32,6 +30,31 @@ up, the more likely that the cause will be lost in the noise. Hence the primary
concern for supporting PB scale filesystems is minimising the time and effort
required for basic forensic analysis of the filesystem structure.
+Therefore, the version 5 disk format introduced larger headers for all metadata
+types, which enable the filesystem to check information being read from the
+disk more rigorously. Metadata integrity fields now include:
+
+* *Magic* numbers, to classify all types of metadata. This is unchanged from v4.
+* A copy of the filesystem *UUID*, to confirm that a given disk block is connected to the superblock.
+* The *owner*, to avoid accessing a piece of metadata which belongs to some other part of the filesystem.
+* The filesystem *block number*, to detect misplaced writes.
+* The *log sequence number* of the last write to this block, to avoid replaying obsolete log entries.
+* A CRC32c *checksum* of the entire block, to detect minor corruption.
+
+Metadata integrity coverage has been extended to all metadata blocks in the
+filesystem, with the following notes:
+
+* Inodes can have multiple ``owners'' in the directory tree; therefore the record contains the inode number instead of an owner or a block number.
+* Superblocks have no owners.
+* The disk quota file has no owner or block numbers.
+* Metadata owned by files list the inode number as the owner.
+* Per-AG data and B+tree blocks list the AG number as the owner.
+* Per-AG header sectors don't list owners or block numbers, since they have fixed locations.
+* Remote attribute blocks are not logged and therefore the LSN must be -1.
+
+This functionality enables XFS to decide that a block's contents are so
+unexpected that it should stop immediately. Unfortunately checksums do not
+allow for automatic correction. Please keep regular backups, as always.
== Self Describing Metadata
diff --git a/design/XFS_Filesystem_Structure/superblock.asciidoc b/design/XFS_Filesystem_Structure/superblock.asciidoc
new file mode 100644
index 0000000..f045530
--- /dev/null
+++ b/design/XFS_Filesystem_Structure/superblock.asciidoc
@@ -0,0 +1,574 @@
+[[Superblocks]]
+== Superblocks
+
+Each AG starts with a superblock. The first one, in AG 0, is the primary
+superblock which stores aggregate AG information. Secondary superblocks are
+only used by xfs_repair when the primary superblock has been corrupted. A
+superblock is one sector in length.
+
+The superblock is defined by the following structure. The description of each
+field follows.
+
+[source, c]
+----
+struct xfs_dsb {
+ __be32 sb_magicnum;
+ __be32 sb_blocksize;
+ __be64 sb_dblocks;
+ __be64 sb_rblocks;
+ __be64 sb_rextents;
+ uuid_t sb_uuid;
+ __be64 sb_logstart;
+ __be64 sb_rootino;
+ __be64 sb_rbmino;
+ __be64 sb_rsumino;
+ __be32 sb_rextsize;
+ __be32 sb_agblocks;
+ __be32 sb_agcount;
+ __be32 sb_rbmblocks;
+ __be32 sb_logblocks;
+ __be16 sb_versionnum;
+ __be16 sb_sectsize;
+ __be16 sb_inodesize;
+ __be16 sb_inopblock;
+ char sb_fname[XFSLABEL_MAX];
+ __u8 sb_blocklog;
+ __u8 sb_sectlog;
+ __u8 sb_inodelog;
+ __u8 sb_inopblog;
+ __u8 sb_agblklog;
+ __u8 sb_rextslog;
+ __u8 sb_inprogress;
+ __u8 sb_imax_pct;
+ __be64 sb_icount;
+ __be64 sb_ifree;
+ __be64 sb_fdblocks;
+ __be64 sb_frextents;
+ __be64 sb_uquotino;
+ __be64 sb_gquotino;
+ __be16 sb_qflags;
+ __u8 sb_flags;
+ __u8 sb_shared_vn;
+ __be32 sb_inoalignmt;
+ __be32 sb_unit;
+ __be32 sb_width;
+ __u8 sb_dirblklog;
+ __u8 sb_logsectlog;
+ __be16 sb_logsectsize;
+ __be32 sb_logsunit;
+ __be32 sb_features2;
+ __be32 sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+ __be32 sb_features_compat;
+ __be32 sb_features_ro_compat;
+ __be32 sb_features_incompat;
+ __be32 sb_features_log_incompat;
+ __le32 sb_crc;
+ __be32 sb_spino_align;
+ __be64 sb_pquotino;
+ __be64 sb_lsn;
+ uuid_t sb_meta_uuid;
+ __be64 sb_metadirino;
+ __be32 sb_rgcount;
+ __be32 sb_rgextents;
+ __u8 sb_rgblklog;
+ __u8 sb_pad[7];
+
+ /* must be padded to 64 bit alignment */
+};
+----
+*sb_magicnum*::
+Identifies the filesystem. Its value is +XFS_SB_MAGIC+ ``XFSB'' (0x58465342).
+
+*sb_blocksize*::
+The size of a basic unit of space allocation in bytes. Typically, this is 4096
+(4KB) but can range from 512 to 65536 bytes.
+
+*sb_dblocks*::
+Total number of blocks available for data and metadata on the filesystem.
+
+*sb_rblocks*::
+Number of blocks in the real-time disk device. Refer to
+xref:Real-time_Devices[real-time sub-volumes] for more information.
+
+*sb_rextents*::
+Number of extents on the real-time device.
+
+*sb_uuid*::
+UUID (Universally Unique ID) for the filesystem. Filesystems can be mounted by
+the UUID instead of device name.
+
+*sb_logstart*::
+First block number for the journaling log if the log is internal (i.e. not on a
+separate disk device). For an external log device, this will be zero (the log
+will also start on the first block on the log device). The identity of the log
+devices is not recorded in the filesystem, but the UUIDs of the filesystem and
+the log device are compared to prevent corruption.
+
+*sb_rootino*::
+Root inode number for the filesystem. Normally, the root inode is at the
+start of the first possible inode chunk in AG 0. This is 128 when using a 4KB
+block size.
+
+*sb_rbmino*::
+Bitmap inode for real-time extents.
+
+*sb_rsumino*::
+Summary inode for real-time bitmap.
+
+*sb_rextsize*::
+Realtime extent size in blocks.
+
+*sb_agblocks*::
+Size of each AG in blocks. For the actual size of the last AG, refer to the
+xref:AG_Free_Space_Management[free space] +agf_length+ value.
+
+*sb_agcount*::
+Number of AGs in the filesystem.
+
+*sb_rbmblocks*::
+Number of real-time bitmap blocks.
+
+*sb_logblocks*::
+Number of blocks for the journaling log.
+
+*sb_versionnum*::
+Filesystem version number. This is a bitmask specifying the features enabled
+when creating the filesystem. Any disk checking tools or drivers that do not
+recognize any set bits must not operate upon the filesystem. Most of the flags
+indicate features introduced over time. If the value of the lower nibble is >=
+4, the higher bits indicate feature flags as follows:
+
+.Version 4 Superblock version flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_SB_VERSION_ATTRBIT+ |
+Set if any inode has extended attributes. If this bit is set; the
++XFS_SB_VERSION2_ATTR2BIT+ is not set; and the +attr2+ mount flag is not
+specified, the +di_forkoff+ inode field will not be dynamically adjusted.
+See the section about xref:Extended_Attribute_Versions[extended attribute
+versions] for more information.
+
+| +XFS_SB_VERSION_NLINKBIT+ | Set if any inodes use 32-bit di_nlink values.
+| +XFS_SB_VERSION_QUOTABIT+ |
+Quotas are enabled on the filesystem. This
+also brings in the various quota fields in the superblock.
+
+| +XFS_SB_VERSION_ALIGNBIT+ | Set if sb_inoalignmt is used.
+| +XFS_SB_VERSION_DALIGNBIT+ | Set if sb_unit and sb_width are used.
+| +XFS_SB_VERSION_SHAREDBIT+ | Set if sb_shared_vn is used.
+| +XFS_SB_VERSION_LOGV2BIT+ | Version 2 journaling logs are used.
+| +XFS_SB_VERSION_SECTORBIT+ | Set if sb_sectsize is not 512.
+| +XFS_SB_VERSION_EXTFLGBIT+ | Unwritten extents are used. This is always set.
+| +XFS_SB_VERSION_DIRV2BIT+ |
+Version 2 directories are used. This is always set.
+
+| +XFS_SB_VERSION_MOREBITSBIT+ |
+Set if the sb_features2 field in the superblock contains more flags.
+|=====
+
+If the lower nibble of this value is 5, then this is a v5 filesystem; the
++XFS_SB_VERSION2_CRCBIT+ feature must be set in +sb_features2+.
+
+*sb_sectsize*::
+Specifies the underlying disk sector size in bytes. Typically this is 512 or
+4096 bytes. This determines the minimum I/O alignment, especially for direct I/O.
+
+*sb_inodesize*::
+Size of the inode in bytes. The default is 256 (2 inodes per standard sector)
+but can be made as large as 2048 bytes when creating the filesystem. On a v5
+filesystem, the default and minimum inode size are both 512 bytes.
+
+*sb_inopblock*::
+Number of inodes per block. This is equivalent to +sb_blocksize / sb_inodesize+.
+
+*sb_fname[12]*::
+Name for the filesystem. This value can be used in the mount command.
+
+*sb_blocklog*::
+log~2~ value of +sb_blocksize+. In other terms, +sb_blocksize = 2^sb_blocklog^+.
+
+*sb_sectlog*::
+log~2~ value of +sb_sectsize+.
+
+*sb_inodelog*::
+log~2~ value of +sb_inodesize+.
+
+*sb_inopblog*::
+log~2~ value of +sb_inopblock+.
+
+*sb_agblklog*::
+log~2~ value of +sb_agblocks+ (rounded up). This value is used to generate inode
+numbers and absolute block numbers defined in extent maps.
+
+*sb_rextslog*::
+log~2~ value of +sb_rextents+.
+
+*sb_inprogress*::
+Flag specifying that the filesystem is being created.
+
+*sb_imax_pct*::
+Maximum percentage of filesystem space that can be used for inodes. The default
+value is 5%.
+
+*sb_icount*::
+Global count of inodes allocated on the filesystem. This is only
+maintained in the first superblock.
+
+*sb_ifree*::
+Global count of free inodes on the filesystem. This is only maintained in the
+first superblock.
+
+*sb_fdblocks*::
+Global count of free data blocks on the filesystem. This is only maintained in
+the first superblock.
+
+*sb_frextents*::
+Global count of free real-time extents on the filesystem. This is only
+maintained in the first superblock.
+
+*sb_uquotino*::
+Inode for user quotas. This and the following two quota fields only apply if
++XFS_SB_VERSION_QUOTABIT+ flag is set in +sb_versionnum+. Refer to
+xref:Quota_Inodes[quota inodes] for more information.
+
+*sb_gquotino*::
+Inode for group or project quotas. Group and project quotas cannot be used at
+the same time on v4 filesystems. On a v5 filesystem, this inode always stores
+group quota information.
+
+*sb_qflags*::
+Quota flags. It can be a combination of the following flags:
+
+.Superblock quota flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_UQUOTA_ACCT+ | User quota accounting is enabled.
+| +XFS_UQUOTA_ENFD+ | User quotas are enforced.
+| +XFS_UQUOTA_CHKD+ | User quotas have been checked.
+| +XFS_PQUOTA_ACCT+ | Project quota accounting is enabled.
+| +XFS_OQUOTA_ENFD+ | Other (group/project) quotas are enforced.
+| +XFS_OQUOTA_CHKD+ | Other (group/project) quotas have been checked.
+| +XFS_GQUOTA_ACCT+ | Group quota accounting is enabled.
+| +XFS_GQUOTA_ENFD+ | Group quotas are enforced.
+| +XFS_GQUOTA_CHKD+ | Group quotas have been checked.
+| +XFS_PQUOTA_ENFD+ | Project quotas are enforced.
+| +XFS_PQUOTA_CHKD+ | Project quotas have been checked.
+|=====
+
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, the +sb_qflags+ field
+will persist across mounts if no quota mount options are provided.
+
+*sb_flags*::
+Miscellaneous flags.
+
+.Superblock flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_SBF_READONLY+ | Only read-only mounts allowed.
+|=====
+
+*sb_shared_vn*::
+Reserved and must be zero (``vn'' stands for version number).
+
+*sb_inoalignmt*::
+Inode chunk alignment in fsblocks. Prior to v5, the default value provided for
+inode chunks to have an 8KiB alignment. Starting with v5, the default value
+scales with the multiple of the inode size over 256 bytes. Concretely, this
+means an alignment of 16KiB for 512-byte inodes, 32KiB for 1024-byte inodes,
+etc. If sparse inodes are enabled, the +ir_startino+ field of each inode
+B+tree record must be aligned to this block granularity, even if the inode
+given by +ir_startino+ itself is sparse.
+
+*sb_unit*::
+Underlying stripe or raid unit in blocks.
+
+*sb_width*::
+Underlying stripe or raid width in blocks.
+
+*sb_dirblklog*::
+log~2~ multiplier that determines the granularity of directory block allocations
+in fsblocks.
+
+*sb_logsectlog*::
+log~2~ value of the log subvolume's sector size. This is only used if the
+journaling log is on a separate disk device (i.e. not internal).
+
+*sb_logsectsize*::
+The log's sector size in bytes if the filesystem uses an external log device.
+
+*sb_logsunit*::
+The log device's stripe or raid unit size. This only applies to version 2 logs,
+i.e. when +XFS_SB_VERSION_LOGV2BIT+ is set in +sb_versionnum+.
+
+*sb_features2*::
+Additional version flags if +XFS_SB_VERSION_MOREBITSBIT+ is set in
++sb_versionnum+. The currently defined additional features include:
+
+.Extended Version 4 Superblock flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_SB_VERSION2_LAZYSBCOUNTBIT+ |
+Lazy global counters. Making a filesystem with this bit set can improve
+performance. The global free space and inode counts are only updated in the
+primary superblock when the filesystem is cleanly unmounted.
+
+| +XFS_SB_VERSION2_ATTR2BIT+ |
+Extended attributes version 2. Making a filesystem with this optimises the
+inode layout of extended attributes. If this bit is set and the +noattr2+
+mount flag is not specified, the +di_forkoff+ inode field will be dynamically
+adjusted. See the section about xref:Extended_Attribute_Versions[extended
+attribute versions] for more information.
+
+| +XFS_SB_VERSION2_PARENTBIT+ |
+Parent pointers. All inodes must have an extended attribute that points back to
+its parent inode. The primary purpose for this information is in backup systems.
+
+| +XFS_SB_VERSION2_PROJID32BIT+ |
+32-bit Project ID. Inodes can be associated with a project ID number, which
+can be used to enforce disk space usage quotas for a particular group of
+directories. This flag indicates that project IDs can be 32 bits in size.
+
+| +XFS_SB_VERSION2_CRCBIT+ |
+Metadata checksumming. All metadata blocks have an extended header containing
+the block checksum, a copy of the metadata UUID, the log sequence number of the
+last update to prevent stale replays, and a back pointer to the owner of the
+block. This feature must be and can only be set if the lowest nibble of
++sb_versionnum+ is set to 5.
+
+| +XFS_SB_VERSION2_FTYPE+ |
+Directory file type. Each directory entry records the type of the inode to
+which the entry points. This speeds up directory iteration by removing the
+need to load every inode into memory.
+|=====
+
+*sb_bad_features2*::
+This field mirrors +sb_features2+, due to past 64-bit alignment errors.
+
+*sb_features_compat*::
+Read-write compatible feature flags. The kernel can still read and write this
+FS even if it doesn't understand the flag. Currently, there are no valid
+flags.
+
+*sb_features_ro_compat*::
+Read-only compatible feature flags. The kernel can still read this FS even if
+it doesn't understand the flag.
+
+.Extended Version 5 Superblock Read-Only compatibility flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_SB_FEAT_RO_COMPAT_FINOBT+ |
+Free inode B+tree. Each allocation group contains a B+tree to track inode chunks
+containing free inodes. This is a performance optimization to reduce the time
+required to allocate inodes.
+
+| +XFS_SB_FEAT_RO_COMPAT_RMAPBT+ |
+Reverse mapping B+tree. Each allocation group contains a B+tree containing
+records mapping AG blocks to their owners. See the section about
+xref:Reconstruction[reconstruction] for more details.
+
+| +XFS_SB_FEAT_RO_COMPAT_REFLINK+ |
+Reference count B+tree. Each allocation group contains a B+tree to track the
+reference counts of AG blocks. This enables files to share data blocks safely.
+See the section about xref:Reflink_Deduplication[reflink and deduplication] for
+more details.
+
+| +XFS_SB_FEAT_RO_COMPAT_INOBTCNT+ |
+Inode B+tree block counters. Each allocation group's inode (AGI) header
+tracks the number of blocks in each of the inode B+trees. This allows us
+to have a slightly higher level of redundancy over the shape of the inode
+btrees, and decreases the amount of time to compute the metadata B+tree
+preallocations at mount time.
+
+|=====
+
+*sb_features_incompat*::
+Read-write incompatible feature flags. The kernel cannot read or write this
+FS if it doesn't understand the flag.
+
+.Extended Version 5 Superblock Read-Write incompatibility flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_SB_FEAT_INCOMPAT_FTYPE+ |
+Directory file type. Each directory entry tracks the type of the inode to
+which the entry points. This is a performance optimization to remove the need
+to load every inode into memory to iterate a directory.
+
+| +XFS_SB_FEAT_INCOMPAT_SPINODES+ |
+Sparse inodes. This feature relaxes the requirement to allocate inodes in
+chunks of 64. When the free space is heavily fragmented, there might exist
+plenty of free space but not enough contiguous free space to allocate a new
+inode chunk. With this feature, the user can continue to create files until
+all free space is exhausted.
+
+Unused space in the inode B+tree records is used to track which parts of the
+inode chunk are not inodes.
+
+See the chapter on xref:Sparse_Inodes[Sparse Inodes] for more information.
+
+| +XFS_SB_FEAT_INCOMPAT_META_UUID+ |
+Metadata UUID. The UUID stamped into each metadata block must match the value
+in +sb_meta_uuid+. This enables the administrator to change +sb_uuid+ at will
+without having to rewrite the entire filesystem.
+
+| +XFS_SB_FEAT_INCOMPAT_BIGTIME+ |
+Large timestamps. Inode timestamps and quota expiration timers are extended to
+support times through the year 2486. See the section on
+xref:Timestamps[timestamps] for more information.
+
+| +XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR+ |
+The filesystem is not in operable condition, and must be run through
+xfs_repair before it can be mounted.
+
+| +XFS_SB_FEAT_INCOMPAT_NREXT64+ |
+Large file fork extent counts. This greatly expands the maximum number of
+space mappings allowed in data and extended attribute file forks.
+
+| +XFS_SB_FEAT_INCOMPAT_EXCHRANGE+ |
+Atomic file mapping exchanges. The filesystem is capable of exchanging a range
+of mappings between two arbitrary ranges of a file's fork by using log intent
+items to track the progress of the high level exchange operation. In other
+words, the exchange operation can be restarted if the system goes down, which
+is necessary for userspace to commit new file contents atomically. This
+flag has user-visible impacts, which is why it is a permanent incompat flag.
+See the section about xref:XMI_Log_Item[mapping exchange log intents] for more
+information.
+
+| +XFS_SB_FEAT_INCOMPAT_PARENT+ |
+Directory parent pointers. See the section about xref:Parent_Pointers[parent
+pointers] for more information.
+
+| +XFS_SB_FEAT_INCOMPAT_METADIR+ |
+Metadata directory tree. See the section about the xref:Metadata_Directories[
+metadata directory tree] for more information.
+
+|=====
+
+*sb_features_log_incompat*::
+Read-write incompatible feature flags for the log. The kernel cannot recover
+the FS log if it doesn't understand the flag.
+
+.Extended Version 5 Superblock Log incompatibility flags
+[options="header"]
+|=====
+| Flag | Description
+| +XFS_SB_FEAT_INCOMPAT_LOG_XATTRS+ |
+Extended attribute updates have been committed to the ondisk log.
+
+|=====
+
+*sb_crc*::
+Superblock checksum.
+
+*sb_spino_align*::
+Sparse inode alignment, in fsblocks. Each chunk of inodes referenced by a
+sparse inode B+tree record must be aligned to this block granularity.
+
+*sb_pquotino*::
+Project quota inode.
+
+*sb_lsn*::
+Log sequence number of the last superblock update.
+
+*sb_meta_uuid*::
+If the +XFS_SB_FEAT_INCOMPAT_META_UUID+ feature is set, then the UUID field in
+all metadata blocks must match this UUID. If not, the block header UUID field
+must match +sb_uuid+.
+
+*sb_metadirino*::
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is set, this field points to
+the inode of the root directory of the metadata directory tree.
+This field is zero otherwise.
+
+*sb_rgcount*::
+Count of realtime groups in the filesystem, if the
++XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled. If no realtime subvolume
+exists, this value will be zero.
+
+*sb_rgextents*::
+Maximum number of realtime extents that can be contained within a realtime
+group, if the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled.
+
+*sb_rgblklog*::
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, this is the log~2~
+value of +sb_rgextents+ * +sb_rextsize+ (rounded up). This value is used to
+generate absolute block numbers defined in extent maps from the segmented
++xfs_rtblock_t+ values.
+
+*sb_pad[7]*::
+Zeroes, if the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled.
+
+=== xfs_db Superblock Example
+
+A filesystem is made on a single disk with the following command:
+
+----
+# mkfs.xfs -i attr=2 -n size=16384 -f /dev/sda7
+meta-data=/dev/sda7 isize=256 agcount=16, agsize=3923122 blks
+ = sectsz=512 attr=2
+data = bsize=4096 blocks=62769952, imaxpct=25
+ = sunit=0 swidth=0 blks, unwritten=1
+naming =version 2 bsize=16384
+log =internal log bsize=4096 blocks=30649, version=1
+ = sectsz=512 sunit=0 blks
+realtime =none extsz=65536 blocks=0, rtextents=0
+----
+
+And in xfs_db, inspecting the superblock:
+
+----
+xfs_db> sb
+xfs_db> p
+magicnum = 0x58465342
+blocksize = 4096
+dblocks = 62769952
+rblocks = 0
+rextents = 0
+uuid = 32b24036-6931-45b4-b68c-cd5e7d9a1ca5
+logstart = 33554436
+rootino = 128
+rbmino = 129
+rsumino = 130
+rextsize = 16
+agblocks = 3923122
+agcount = 16
+rbmblocks = 0
+logblocks = 30649
+versionnum = 0xb084
+sectsize = 512
+inodesize = 256
+inopblock = 16
+fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
+blocklog = 12
+sectlog = 9
+inodelog = 8
+inopblog = 4
+agblklog = 22
+rextslog = 0
+inprogress = 0
+imax_pct = 25
+icount = 64
+ifree = 61
+fdblocks = 62739235
+frextents = 0
+uquotino = 0
+gquotino = 0
+qflags = 0
+flags = 0
+shared_vn = 0
+inoalignmt = 2
+unit = 0
+width = 0
+dirblklog = 2
+logsectlog = 0
+logsectsize = 0
+logsunit = 0
+features2 = 8
+----
diff --git a/design/XFS_Filesystem_Structure/timestamps.asciidoc b/design/XFS_Filesystem_Structure/timestamps.asciidoc
new file mode 100644
index 0000000..56d4dc9
--- /dev/null
+++ b/design/XFS_Filesystem_Structure/timestamps.asciidoc
@@ -0,0 +1,65 @@
+[[Timestamps]]
+= Timestamps
+
+XFS needs to be able to persist the concept of a point in time. This chapter
+discusses how timestamps are represented on disk.
+
+[[Inode_Timestamps]]
+== Inode Timestamps
+
+The filesystem preserves up to four different timestamps for each file stored
+in the filesystem. These quantities are: the time when the file was created
+(+di_crtime+), the last time the file metadata were changed (+di_ctime+), the
+last time the file contents were changed (+di_mtime+), and the last time the
+file contents were accessed (+di_atime+). The filesystem epoch is aligned with
+the Unix epoch, which is to say that a value of all zeroes represents 00:00:00
+UTC on January 1st, 1970.
+
+Prior to the introduction of the bigtime feature, inode timestamps were
+laid out as a segmented counter of seconds and nanoseconds:
+
+[source, c]
+----
+struct xfs_legacy_timestamp {
+ __int32_t t_sec;
+ __int32_t t_nsec;
+};
+----
+
+The smallest date this format can represent is 20:45:52 UTC on December 13th,
+1901, and the largest date supported is 03:14:07 UTC on January 19, 2038.
+
+With the introduction of the bigtime feature, the format is changed to
+interpret the timestamp as a 64-bit count of nanoseconds since the smallest
+date supported by the old encoding. This means that the smallest date
+supported is still 20:45:52 UTC on December 13th, 1901; but now the largest
+date supported is 20:20:24 UTC on July 2nd, 2486.
+
+[[Quota_Timers]]
+== Quota Grace Period Expiration Timers
+
+XFS' quota control allows administrators to set a soft limit on each type of
+resource that a regular user can consume: inodes, blocks, and realtime blocks.
+The administrator can establish a grace period after which the soft limit
+becomes a hard limit for the user. Therefore, XFS needs to be able to store
+the exact time when a grace period expires.
+
+Prior to the introduction of the bigtime feature, quota grace period
+expirations were unsigned 32-bit seconds counters, with the magic value zero
+meaning that the soft limit has not been exceeded. Therefore, the smallest
+expiration date that can be expressed is 00:00:01 UTC on January 1st, 1970; and
+the largest is 06:28:15 UTC on February 7th, 2106.
+
+With the introduction of the bigtime feature, the ondisk field now encodes the
+upper 32 bits of an unsigned 34-bit seconds counter. Zero is still a magic
+value that means the soft limit has not been exceeded. The smallest quota
+expiration date is now 00:00:04 UTC on January 1st, 1970; and the largest is
+20:20:24 UTC on July 2nd, 2486. The format can encode slightly larger
+expiration dates, but it was decided to end support for both timers at exactly
+the same point.
+
+The default grace periods are stored in the timer fields of the quota record
+for id zero. Since this quantity is an interval, these fields are always
+interpreted as an unsigned 32 bit quantity. Therefore, the longest possible
+grace period is approximately 136 years, 29 weeks, 3 days, 6 hours, 28 minutes
+and 15 seconds.
diff --git a/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc b/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc
index 5dba1c7..a643d18 100644
--- a/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc
+++ b/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc
@@ -6,8 +6,8 @@
// and we really need an area to set up various docbook sections like copyright
// and legal notice sections.
//
-XFS Filesystem Disk Structures
-==============================
+XFS Algorithms & Data Structures
+================================
:doctype: book
:docinfo1:
@@ -46,7 +46,9 @@ log items which are formatted in host order.
include::overview.asciidoc[]
-include::metadata_integrity.asciidoc[]
+include::self_describing_metadata.asciidoc[]
+
+include::delayed_logging.asciidoc[]
include::reflink.asciidoc[]
@@ -70,6 +72,8 @@ include::btrees.asciidoc[]
include::dabtrees.asciidoc[]
+include::timestamps.asciidoc[]
+
include::allocation_groups.asciidoc[]
include::rmapbt.asciidoc[]
@@ -80,6 +84,10 @@ include::journaling_log.asciidoc[]
include::internal_inodes.asciidoc[]
+include::realtime.asciidoc[]
+
+include::fs_properties.asciidoc[]
+
:leveloffset: 0
Dynamically Allocated Structures
diff --git a/design/xfs-smr-structure.asciidoc b/design/xfs-smr-structure.asciidoc
index dd959ab..b970224 100644
--- a/design/xfs-smr-structure.asciidoc
+++ b/design/xfs-smr-structure.asciidoc
@@ -67,7 +67,7 @@ next to the metadata zone, but typically metadata writes are not correlated with
log writes.
Hence the only real functionality we need to add to the log is the tail pushing
-modificaitons to move the tail into the same zone as the head, as well as being
+modifications to move the tail into the same zone as the head, as well as being
able to trigger and block on zone write pointer reset operations.
The log doesn't actually need to track the zone write pointer, though log
@@ -90,7 +90,7 @@ packed extent allocation only) to ensure that newly written blocks are allocated
in a sane manner.
We're going to need userspace to be able to see the contents of these inodes;
-read only access wil be needed to analyse the contents of the zone, so we're
+read only access will be needed to analyse the contents of the zone, so we're
going to need a special directory to expose this information. It would be useful
to have a ".zones" directory hanging off the root directory that contains all
the zone allocation inodes so userspace can simply open them.
@@ -112,14 +112,14 @@ also have other benefits...
While it seems like tracking free space is trivial for the purposes of
allocation (and it is!), the complexity comes when we start to delete or
overwrite data. Suddenly zones no longer contain contiguous ranges of valid
-data; they have "freed" extents in the middle of them that contian stale data.
+data; they have "freed" extents in the middle of them that contain stale data.
We can't use that "stale space" until the entire zone is made up of "stale"
extents. Hence we need a Cleaner.
=== Zone Cleaner
The purpose of the cleaner is to find zones that are mostly stale space and
-consolidate the remaining referenced data into a new, contigious zone, enabling
+consolidate the remaining referenced data into a new, contiguous zone, enabling
us to then "clean" the stale zone and make it available for writing new data
again.
@@ -129,7 +129,7 @@ parent pointer functionality. This gives us the mechanism by which we can
quickly re-organise files that have extents in zones that need cleaning.
The key word here is "reorganise". We have a tool that already reorganises file
-layout: xfs_fsr. The "Cleaner" is a finely targetted policy for xfs_fsr -
+layout: xfs_fsr. The "Cleaner" is a finely targeted policy for xfs_fsr -
instead of trying to minimise fixpel fragments, it finds zones that need
cleaning by reading their summary info from the /.zones/ directory and analysing
the free bitmap state if there is a high enough percentage of stale blocks. From
@@ -200,7 +200,7 @@ random write space for all our metadata......
A basic guideline is that for 4k blocks and zones of 256MB, we'll need 8kB of
bitmap space and two inodes, so call it 10kB per 256MB zone. That's 40MB per TB
-for free space bitmaps. We'll want to suport at least 1 million inodes per TB,
+for free space bitmaps. We'll want to support at least 1 million inodes per TB,
so that's another 512MB per TB, plus another 256MB per TB for directory
structures. There's other bits and pieces of metadata as well (attribute space,
internal freespace btrees, reverse map btrees, etc.
@@ -316,7 +316,7 @@ spiral.
I suspect the best we will be able to do with fallocate based preallocation is
to mark the region as delayed allocation.
-=== Allocation Alignemnt
+=== Allocation Alignment
With zone based write pointers, we lose all capability of write alignment to the
underlying storage - our only choice to write is the current set of write