From: David Howells <dhowells@redhat.com>

The attached patch creates a facility by which a filesystem, character
device or block device can detect that a page mapped to an inode is about
to become writable.  This provides the facility at two levels: a vma
operation and an address space operation.  This can be used by a netfs to
synchronise with a cache writing the page to disc.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/linux/fs.h |    3 +
 25-akpm/include/linux/mm.h |    4 +
 25-akpm/mm/filemap.c       |   22 +++++++++-
 25-akpm/mm/memory.c        |   97 ++++++++++++++++++++++++++++++++++-----------
 4 files changed, 102 insertions(+), 24 deletions(-)

diff -puN include/linux/fs.h~add-page-becoming-writable-notification include/linux/fs.h
--- 25/include/linux/fs.h~add-page-becoming-writable-notification	2004-11-15 20:25:05.148980920 -0800
+++ 25-akpm/include/linux/fs.h	2004-11-15 20:25:05.158979400 -0800
@@ -333,6 +333,9 @@ struct address_space_operations {
 	int (*releasepage) (struct page *, int);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
+
+	/* notification that a page is about to become writable */
+	int (*page_mkwrite)(struct page *page);
 };
 
 struct backing_dev_info;
diff -puN include/linux/mm.h~add-page-becoming-writable-notification include/linux/mm.h
--- 25/include/linux/mm.h~add-page-becoming-writable-notification	2004-11-15 20:25:05.150980616 -0800
+++ 25-akpm/include/linux/mm.h	2004-11-15 20:25:05.159979248 -0800
@@ -208,6 +208,10 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+
+	/* notification that a previously read-only page is about to become
+	 * writable; if an error is returned it will cause a SIGBUS */
+	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
 #ifdef CONFIG_NUMA
 	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
diff -puN mm/filemap.c~add-page-becoming-writable-notification mm/filemap.c
--- 25/mm/filemap.c~add-page-becoming-writable-notification	2004-11-15 20:25:05.152980312 -0800
+++ 25-akpm/mm/filemap.c	2004-11-15 20:25:05.161978944 -0800
@@ -1503,6 +1503,13 @@ generic_file_get_policy(struct vm_area_s
 }
 #endif
 
+/* Relay the page-becoming-writable notification to the owning a_ops hook. */
+static int filemap_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	return mapping->a_ops->page_mkwrite(page);
+}
 
 struct vm_operations_struct generic_file_vm_ops = {
 	.nopage		= filemap_nopage,
@@ -1513,6 +1520,16 @@ struct vm_operations_struct generic_file
 #endif
 };
 
+struct vm_operations_struct generic_file_vm_mkwr_ops = {
+	.page_mkwrite	= filemap_page_mkwrite,	/* forwards to a_ops */
+	.populate	= filemap_populate,
+	.nopage		= filemap_nopage,
+#ifdef CONFIG_NUMA
+	.get_policy     = generic_file_get_policy,
+	.set_policy     = generic_file_set_policy,
+#endif
+};
+
 /* This is used for a general mmap of a disk file */
 
 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -1522,7 +1539,10 @@ int generic_file_mmap(struct file * file
 	if (!mapping->a_ops->readpage)
 		return -ENOEXEC;
 	file_accessed(file);
-	vma->vm_ops = &generic_file_vm_ops;
+	if (!mapping->a_ops->page_mkwrite)
+		vma->vm_ops = &generic_file_vm_ops;
+	else
+		vma->vm_ops = &generic_file_vm_mkwr_ops;
 	return 0;
 }
 
diff -puN mm/memory.c~add-page-becoming-writable-notification mm/memory.c
--- 25/mm/memory.c~add-page-becoming-writable-notification	2004-11-15 20:25:05.153980160 -0800
+++ 25-akpm/mm/memory.c	2004-11-15 20:25:05.163978640 -0800
@@ -1265,6 +1265,54 @@ static inline void break_cow(struct vm_a
 }
 
 /*
+ * Make a PTE writable for do_wp_page() when the old page can be reused.
+ * Entered with mm->page_table_lock held and page_table mapped; both are
+ * released on every return path.  If the VMA has a page_mkwrite() op it is
+ * called first, with the lock dropped so that it may sleep.
+ * Returns VM_FAULT_MINOR, or VM_FAULT_SIGBUS if page_mkwrite() failed.
+ */
+static inline int do_wp_page_mk_pte_writable(struct mm_struct *mm,
+					     struct vm_area_struct *vma,
+					     unsigned long address,
+					     pte_t *page_table,
+					     struct page *old_page,
+					     pte_t pte)
+{
+	pte_t entry;
+
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+		int ret;
+
+		/* Pin the page so it cannot be freed under the callback once
+		 * the lock is dropped to let the owner sleep */
+		page_cache_get(old_page);
+		spin_unlock(&mm->page_table_lock);
+		ret = vma->vm_ops->page_mkwrite(vma, old_page);
+		page_cache_release(old_page);
+		if (ret < 0)
+			goto bus_error;
+
+		/* The PTE may have changed while unlocked.  If so just return
+		 * and let the MMU fault again if it is still not writable */
+		spin_lock(&mm->page_table_lock);
+		if (!pte_same(*page_table, pte))
+			goto minor_fault;
+	}
+
+	flush_cache_page(vma, address);
+	entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), vma);
+	ptep_set_access_flags(vma, address, page_table, entry, 1);
+	update_mmu_cache(vma, address, entry);
+ minor_fault:
+	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
+	return VM_FAULT_MINOR;
+ bus_error:
+	pte_unmap(page_table);
+	return VM_FAULT_SIGBUS;
+}
+
+/*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
@@ -1286,7 +1334,6 @@ static int do_wp_page(struct mm_struct *
 {
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(pte);
-	pte_t entry;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1305,17 +1352,11 @@ static int do_wp_page(struct mm_struct *
 	if (!TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address);
-			entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
-						vma);
-			ptep_set_access_flags(vma, address, page_table,
-						entry, 1);
-			update_mmu_cache(vma, address, entry);
-			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
-			return VM_FAULT_MINOR;
-		}
+		if (reuse)
+			/* We can just make the PTE writable */
+			return do_wp_page_mk_pte_writable(mm, vma, address,
+							  page_table, old_page,
+							  pte);
 	}
 	pte_unmap(page_table);
 
@@ -1877,18 +1918,28 @@ retry:
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
-		struct page *page;
+	if (write_access) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			struct page *page;
 
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-		if (!page)
-			goto oom;
-		copy_user_highpage(page, new_page, address);
-		page_cache_release(new_page);
-		new_page = page;
-		anon = 1;
+			if (unlikely(anon_vma_prepare(vma)))
+				goto oom;
+			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			if (!page)
+				goto oom;
+			copy_user_highpage(page, new_page, address);
+			page_cache_release(new_page);
+			new_page = page;
+			anon = 1;
+
+		} else {
+			/* if the page will be shareable, see if the backing
+			 * address space wants to know that the page is about
+			 * to become writable */
+			if (vma->vm_ops->page_mkwrite &&
+			    vma->vm_ops->page_mkwrite(vma, new_page) < 0)
+				return VM_FAULT_SIGBUS;
+		}
 	}
 
 	spin_lock(&mm->page_table_lock);
_